1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  * SPDX-License-Identifier: MIT
5  *
6  * based in part on anv driver which is:
7  * Copyright © 2015 Intel Corporation
8  */
9 
10 #include "tu_pipeline.h"
11 
12 #include "common/freedreno_guardband.h"
13 
14 #include "ir3/ir3_nir.h"
15 #include "nir/nir.h"
16 #include "nir/nir_builder.h"
17 #include "nir/nir_serialize.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/u_debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_nir.h"
22 #include "vk_pipeline.h"
23 #include "vk_render_pass.h"
24 #include "vk_util.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_knl.h"
30 #include "tu_formats.h"
31 #include "tu_lrz.h"
32 #include "tu_pass.h"
33 #include "tu_rmv.h"
34 
35 /* Emit IB that preloads the descriptors that the shader uses */
36 
37 static void
38 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
39                 enum a6xx_state_block sb, unsigned base, unsigned offset,
40                 unsigned count)
41 {
42    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
43     * clear if emitting more packets will even help anything. Presumably the
44     * descriptor cache is relatively small, and these packets stop doing
45     * anything when there are too many descriptors.
46     */
47    tu_cs_emit_pkt7(cs, opcode, 3);
48    tu_cs_emit(cs,
49               CP_LOAD_STATE6_0_STATE_TYPE(st) |
50               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
51               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
52               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
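   /* With STATE_SRC = SS6_BINDLESS, the "source address" below is not a real
    * address: it selects a BINDLESS_BASE register (base << 28) plus a dword
    * offset within that descriptor set.
    */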
53    tu_cs_emit_qw(cs, offset | (base << 28));
54 }
55 
56 static unsigned
57 tu6_load_state_size(struct tu_pipeline *pipeline,
58                     struct tu_pipeline_layout *layout)
59 {
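   /* Worst-case dword count of the descriptor-preload IB: each prefetch
    * emitted by emit_load_state() is 4 dwords (a pkt7 header plus 3 payload
    * dwords).
    */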
60    const unsigned load_state_size = 4;
61    unsigned size = 0;
62    for (unsigned i = 0; i < layout->num_sets; i++) {
63       if (!(pipeline->active_desc_sets & (1u << i)))
64          continue;
65 
66       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
67       for (unsigned j = 0; j < set_layout->binding_count; j++) {
68          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
69          unsigned count = 0;
70          /* See comment in tu6_emit_load_state(). */
71          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
72          unsigned stage_count = util_bitcount(stages);
73 
74          if (!binding->array_size)
75             continue;
76 
77          switch (binding->type) {
78          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
79          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
80          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
81          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
82             /* IBO-backed resources only need one packet for all graphics stages */
83             if (stage_count)
84                count += 1;
85             break;
86          case VK_DESCRIPTOR_TYPE_SAMPLER:
87          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
88          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
89          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
90          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
91             /* Textures and UBOs need a packet for each stage */
92             count = stage_count;
93             break;
94          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
95             /* Because of how we pack combined images and samplers, we
96              * currently can't use one packet for the whole array.
97              */
98             count = stage_count * binding->array_size * 2;
99             break;
100          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
101          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
102          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
103             break;
104          default:
105             unreachable("bad descriptor type");
106          }
107          size += count * load_state_size;
108       }
109    }
110    return size;
111 }
112 
113 static void
114 tu6_emit_load_state(struct tu_device *device,
115                     struct tu_pipeline *pipeline,
116                     struct tu_pipeline_layout *layout)
117 {
118    unsigned size = tu6_load_state_size(pipeline, layout);
119    if (size == 0)
120       return;
121 
122    struct tu_cs cs;
123    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
124 
125    for (unsigned i = 0; i < layout->num_sets; i++) {
126       /* From 13.2.7. Descriptor Set Binding:
127        *
128        *    A compatible descriptor set must be bound for all set numbers that
129        *    any shaders in a pipeline access, at the time that a draw or
130        *    dispatch command is recorded to execute using that pipeline.
131        *    However, if none of the shaders in a pipeline statically use any
132        *    bindings with a particular set number, then no descriptor set need
133        *    be bound for that set number, even if the pipeline layout includes
134        *    a non-trivial descriptor set layout for that set number.
135        *
136        * This means that descriptor sets unused by the pipeline may have a
137        * garbage or 0 BINDLESS_BASE register, which will cause context faults
138        * when prefetching descriptors from these sets. Skip prefetching for
139        * descriptors from them to avoid this. This is also an optimization,
140        * since these prefetches would be useless.
141        */
142       if (!(pipeline->active_desc_sets & (1u << i)))
143          continue;
144 
145       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
146       for (unsigned j = 0; j < set_layout->binding_count; j++) {
147          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
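         /* base selects which BINDLESS_BASE register the prefetch reads from
          * (normally the set index), and offset is the binding's start within
          * that set's descriptor buffer, in dwords.
          */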
148          unsigned base = i;
149          unsigned offset = binding->offset / 4;
150          /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
151           * zink has descriptors for each stage in the push layout even if some
152           * stages aren't present in a used pipeline.  We don't want to emit
153           * loads for unused descriptors.
154           */
155          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
156          unsigned count = binding->array_size;
157 
158          /* If this is a variable-count descriptor, then the array_size is an
159           * upper bound on the size, but we don't know how many descriptors
160           * will actually be used. Therefore we can't pre-load them here.
161           */
162          if (j == set_layout->binding_count - 1 &&
163              set_layout->has_variable_descriptors)
164             continue;
165 
166          if (count == 0 || stages == 0)
167             continue;
168          switch (binding->type) {
169          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
170             assert(device->physical_device->reserved_set_idx >= 0);
171             base = device->physical_device->reserved_set_idx;
172             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
173                       binding->dynamic_offset_offset) / 4;
174             FALLTHROUGH;
175          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
176          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
177          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
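            /* binding->size is in bytes and each descriptor slot is
             * A6XX_TEX_CONST_DWORDS dwords, so mul is the number of
             * descriptor slots per array element.
             */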
178             unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
179             /* IBO-backed resources only need one packet for all graphics stages */
180             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
181                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
182                                base, offset, count * mul);
183             }
184             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
185                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
186                                base, offset, count * mul);
187             }
188             break;
189          }
190          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
191          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
192          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
193             /* nothing - input attachments and inline uniforms don't use bindless */
194             break;
195          case VK_DESCRIPTOR_TYPE_SAMPLER:
196          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
197          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
198             tu_foreach_stage(stage, stages) {
199                emit_load_state(&cs, tu6_stage2opcode(stage),
200                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
201                                ST6_SHADER : ST6_CONSTANTS,
202                                tu6_stage2texsb(stage), base, offset, count);
203             }
204             break;
205          }
206          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
207             assert(device->physical_device->reserved_set_idx >= 0);
208             base = device->physical_device->reserved_set_idx;
209             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
210                       binding->dynamic_offset_offset) / 4;
211             FALLTHROUGH;
212          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213             tu_foreach_stage(stage, stages) {
214                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215                                tu6_stage2shadersb(stage), base, offset, count);
216             }
217             break;
218          }
219          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220             tu_foreach_stage(stage, stages) {
221                /* TODO: We could emit less CP_LOAD_STATE6 if we used
222                 * struct-of-arrays instead of array-of-structs.
223                 */
224                for (unsigned i = 0; i < count; i++) {
225                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227                   emit_load_state(&cs, tu6_stage2opcode(stage),
228                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
229                                   base, tex_offset, 1);
230                   emit_load_state(&cs, tu6_stage2opcode(stage),
231                                   ST6_SHADER, tu6_stage2texsb(stage),
232                                   base, sam_offset, 1);
233                }
234             }
235             break;
236          }
237          default:
238             unreachable("bad descriptor type");
239          }
240       }
241    }
242 
243    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245 
246 struct tu_pipeline_builder
247 {
248    struct tu_device *device;
249    void *mem_ctx;
250    struct vk_pipeline_cache *cache;
251    const VkAllocationCallbacks *alloc;
252    const VkGraphicsPipelineCreateInfo *create_info;
253    VkPipelineCreateFlags2KHR create_flags;
254 
255    struct tu_pipeline_layout layout;
256 
257    struct tu_pvtmem_config pvtmem;
258 
259    bool rasterizer_discard;
260    /* these states are affected by rasterizer_discard */
261    uint8_t unscaled_input_fragcoord;
262 
263    /* Each library defines at least one piece of state in
264     * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
265     * there can be at most as many libraries as pieces of state, of which
266     * there are currently 4.
267     */
268 #define MAX_LIBRARIES 4
269 
270    unsigned num_libraries;
271    struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
272 
273    /* This is just the state that we are compiling now, whereas the final
274     * pipeline will include the state from the libraries.
275     */
276    VkGraphicsPipelineLibraryFlagsEXT state;
277 
278    /* The stages we are compiling now. */
279    VkShaderStageFlags active_stages;
280 
281    bool fragment_density_map;
282 
283    struct vk_graphics_pipeline_all_state all_state;
284    struct vk_graphics_pipeline_state graphics_state;
285 };
286 
287 static bool
288 tu_logic_op_reads_dst(VkLogicOp op)
289 {
290    switch (op) {
291    case VK_LOGIC_OP_CLEAR:
292    case VK_LOGIC_OP_COPY:
293    case VK_LOGIC_OP_COPY_INVERTED:
294    case VK_LOGIC_OP_SET:
295       return false;
296    default:
297       return true;
298    }
299 }
300 
301 static bool
302 tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
303 {
304    for (unsigned i = 0; i < cb->attachment_count; i++) {
305       if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
306           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
307           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
308           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
309          return true;
310    }
311 
312    return false;
313 }
314 
315 enum ir3_push_consts_type
316 tu_push_consts_type(const struct tu_pipeline_layout *layout,
317                     const struct ir3_compiler *compiler)
318 {
319    if (!layout->push_constant_size)
320       return IR3_PUSH_CONSTS_NONE;
321 
322    if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
323       return IR3_PUSH_CONSTS_PER_STAGE;
324 
325    if (tu6_shared_constants_enable(layout, compiler)) {
326       return IR3_PUSH_CONSTS_SHARED;
327    } else {
328       if (compiler->gen >= 7) {
329          return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
330       } else {
331          return IR3_PUSH_CONSTS_PER_STAGE;
332       }
333    }
334 }
335 
336 template <chip CHIP>
337 struct xs_config {
338    uint16_t reg_sp_xs_config;
339    uint16_t reg_hlsq_xs_ctrl;
340 };
341 
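/* Per-stage SP_xS_CONFIG / HLSQ_xS_CNTL register pairs used by
 * tu6_emit_xs_config(); the HLSQ_xS_CNTL registers live at different offsets
 * on a7xx.
 */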
342 template <chip CHIP>
343 static const xs_config<CHIP> xs_configs[] = {
344    [MESA_SHADER_VERTEX] = {
345       REG_A6XX_SP_VS_CONFIG,
346       CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
347    },
348    [MESA_SHADER_TESS_CTRL] = {
349       REG_A6XX_SP_HS_CONFIG,
350       CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
351    },
352    [MESA_SHADER_TESS_EVAL] = {
353       REG_A6XX_SP_DS_CONFIG,
354       CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
355    },
356    [MESA_SHADER_GEOMETRY] = {
357       REG_A6XX_SP_GS_CONFIG,
358       CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
359    },
360    [MESA_SHADER_FRAGMENT] = {
361       REG_A6XX_SP_FS_CONFIG,
362       CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
363    },
364    [MESA_SHADER_COMPUTE] = {
365       REG_A6XX_SP_CS_CONFIG,
366       CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
367    },
368 };
369 
370 template <chip CHIP>
371 void
372 tu6_emit_xs_config(struct tu_cs *cs,
373                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
374                    const struct ir3_shader_variant *xs)
375 {
376    const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[stage];
377 
378    if (!xs) {
379       /* shader stage disabled */
380       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
381       tu_cs_emit(cs, 0);
382 
383       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
384       tu_cs_emit(cs, 0);
385       return;
386    }
387 
388    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
389    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
390                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
391                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
392                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
393                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
394                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
395                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
396 
397    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
398    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
399                      A6XX_HLSQ_VS_CNTL_ENABLED |
400                      COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
401                           A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
402 }
403 TU_GENX(tu6_emit_xs_config);
404 
405 static void
406 tu6_emit_dynamic_offset(struct tu_cs *cs,
407                         const struct ir3_shader_variant *xs,
408                         const struct tu_shader *shader,
409                         const struct tu_program_state *program)
410 {
411    const struct tu_physical_device *phys_dev = cs->device->physical_device;
412 
413    if (!xs)
414       return;
415 
416    if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
417       if (shader->const_state.dynamic_offsets_ubo.size == 0)
418          return;
419 
420       uint32_t offsets[MAX_SETS];
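      /* dynamic_descriptor_offsets[] is in bytes; dividing by the descriptor
       * size (A6XX_TEX_CONST_DWORDS * 4 bytes) converts it to an index in
       * descriptor-sized units.
       */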
421       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
422          unsigned dynamic_offset_start =
423             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
424          offsets[i] = dynamic_offset_start;
425       }
426 
427       /* A7XX TODO: Emit data via sub_cs instead of NOP */
428       uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
429       uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
430 
431       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
432       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
433                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
434                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
435                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
436                CP_LOAD_STATE6_0_NUM_UNIT(1));
437       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
438       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
439       int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
440       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
441    } else {
442       if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
443          return;
444 
445       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
446       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
447                CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
448                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
449                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
450                CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
451       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
452       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
453 
454       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
455          unsigned dynamic_offset_start =
456             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
457          tu_cs_emit(cs, dynamic_offset_start);
458       }
459    }
460 }
461 
462 template <chip CHIP>
463 void
464 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
465 {
466    if (CHIP == A6XX) {
467       /* Enable/disable shared constants */
468       tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
469    } else {
470       assert(!enable);
471    }
472 
473    tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
474                                             .isammode = ISAMMODE_GL,
475                                             .shared_consts_enable = enable));
476 }
477 TU_GENX(tu6_emit_shared_consts_enable);
478 
479 template <chip CHIP>
480 static void
481 tu6_setup_streamout(struct tu_cs *cs,
482                     const struct ir3_shader_variant *v,
483                     const struct ir3_shader_linkage *l)
484 {
485    const struct ir3_stream_output_info *info = &v->stream_output;
486    /* Note: 64 here comes from the HW layout of the program RAM. The program
487     * for stream N is at DWORD 64 * N.
488     */
489 #define A6XX_SO_PROG_DWORDS 64
490    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
491    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
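   /* Each VPC_SO_PROG dword holds two entries (A and B), one per output
    * component, so the component at VPC location "loc" lands in dword loc/2
    * of its stream's program.
    */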
492 
493    /* TODO: streamout state should be in a non-GMEM draw state */
494 
495    /* no streamout: */
496    if (info->num_outputs == 0) {
497       unsigned sizedw = 4;
498       if (cs->device->physical_device->info->a6xx.tess_use_shared)
499          sizedw += 2;
500 
501       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
502       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
503       tu_cs_emit(cs, 0);
504       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
505       tu_cs_emit(cs, 0);
506 
507       if (cs->device->physical_device->info->a6xx.tess_use_shared) {
508          tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
509          tu_cs_emit(cs, 0);
510       }
511 
512       return;
513    }
514 
515    for (unsigned i = 0; i < info->num_outputs; i++) {
516       const struct ir3_stream_output *out = &info->output[i];
517       unsigned k = out->register_index;
518       unsigned idx;
519 
520       /* Skip it, if it's an output that was never assigned a register. */
521       if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
522          continue;
523 
524       /* linkage map sorted by order frag shader wants things, so
525        * a bit less ideal here..
526        */
527       for (idx = 0; idx < l->cnt; idx++)
528          if (l->var[idx].slot == v->outputs[k].slot)
529             break;
530 
531       assert(idx < l->cnt);
532 
533       for (unsigned j = 0; j < out->num_components; j++) {
534          unsigned c   = j + out->start_component;
535          unsigned loc = l->var[idx].loc + c;
536          unsigned off = j + out->dst_offset;  /* in dwords */
537 
538          assert(loc < A6XX_SO_PROG_DWORDS * 2);
539          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
540          if (loc & 1) {
541             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
542                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
543                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
544          } else {
545             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
546                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
547                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
548          }
549          BITSET_SET(valid_dwords, dword);
550       }
551    }
552 
553    unsigned prog_count = 0;
554    unsigned start, end;
555    BITSET_FOREACH_RANGE(start, end, valid_dwords,
556                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
557       prog_count += end - start + 1;
558    }
559 
560    const bool emit_pc_so_stream_cntl =
561       cs->device->physical_device->info->a6xx.tess_use_shared &&
562       v->type == MESA_SHADER_TESS_EVAL;
563 
564    if (emit_pc_so_stream_cntl)
565       prog_count += 1;
566 
567    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
568    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
569    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
570                   COND(info->stride[0] > 0,
571                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
572                   COND(info->stride[1] > 0,
573                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
574                   COND(info->stride[2] > 0,
575                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
576                   COND(info->stride[3] > 0,
577                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
578    for (uint32_t i = 0; i < 4; i++) {
579       tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
580       tu_cs_emit(cs, info->stride[i]);
581    }
582    bool first = true;
583    BITSET_FOREACH_RANGE(start, end, valid_dwords,
584                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
585       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
586       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
587                      A6XX_VPC_SO_CNTL_ADDR(start));
588       for (unsigned i = start; i < end; i++) {
589          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
590          tu_cs_emit(cs, prog[i]);
591       }
592       first = false;
593    }
594 
595    if (emit_pc_so_stream_cntl) {
596       /* Possibly not tess_use_shared related, but the combination of
597        * tess + xfb fails some tests if we don't emit this.
598        */
599       tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
600       tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
601    }
602 }
603 
604 enum tu_geom_consts_type
605 {
606    TU_CONSTS_PRIMITIVE_MAP,
607    TU_CONSTS_PRIMITIVE_PARAM,
608 };
609 
610 static void
611 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
612                const struct ir3_const_state *const_state,
613                unsigned constlen, enum a6xx_state_block block,
614                uint32_t offset, uint32_t size, const uint32_t *dwords) {
615    assert(size % 4 == 0);
616    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
617 
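   /* Two paths: without preamble-loaded constants the data is loaded
    * directly into the constant file via CP_LOAD_STATE6; on a7xx it is
    * instead written into the CS and bound as a driver-internal UBO that the
    * shader preamble loads from.
    */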
618    if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
619       uint32_t base;
620       switch (type) {
621       case TU_CONSTS_PRIMITIVE_MAP:
622          base = const_state->offsets.primitive_map;
623          break;
624       case TU_CONSTS_PRIMITIVE_PARAM:
625          base = const_state->offsets.primitive_param;
626          break;
627       default:
628          unreachable("bad consts type");
629       }
630 
631       int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
632       if (adjusted_size <= 0)
633          return;
634 
635       tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
636       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
637             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
638             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
639             CP_LOAD_STATE6_0_STATE_BLOCK(block) |
640             CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
641 
642       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
643       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
644 
645       tu_cs_emit_array(cs, dwords, adjusted_size);
646    } else {
647       uint32_t base;
648       switch (type) {
649       case TU_CONSTS_PRIMITIVE_MAP:
650          base = const_state->primitive_map_ubo.idx;
651          break;
652       case TU_CONSTS_PRIMITIVE_PARAM:
653          base = const_state->primitive_param_ubo.idx;
654          break;
655       default:
656          unreachable("bad consts type");
657       }
658       if (base == -1)
659          return;
660 
661       /* A7XX TODO: Emit data via sub_cs instead of NOP */
662       uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
663 
664       tu_cs_emit_pkt7(cs, opcode, 5);
665       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
666                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
667                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
668                CP_LOAD_STATE6_0_STATE_BLOCK(block) |
669                CP_LOAD_STATE6_0_NUM_UNIT(1));
670       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
671       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
672       int size_vec4s = DIV_ROUND_UP(size, 4);
673       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
674    }
675 }
676 
677 static void
678 tu6_emit_link_map(struct tu_cs *cs,
679                   const struct ir3_shader_variant *producer,
680                   const struct ir3_shader_variant *consumer,
681                   enum a6xx_state_block sb)
682 {
683    const struct ir3_const_state *const_state = ir3_const_state(consumer);
684    uint32_t size = ALIGN(consumer->input_size, 4);
685 
686    if (size == 0)
687       return;
688 
689    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
690                   const_state, consumer->constlen, sb, 0, size, producer->output_loc);
691 }
692 
693 static int
694 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
695                      const struct ir3_shader_variant *last_shader,
696                      uint32_t index,
697                      uint8_t *interp_mode,
698                      uint8_t *ps_repl_mode)
699 {
700    const uint32_t compmask = fs->inputs[index].compmask;
701 
702    /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
703     * fourth component occupy three consecutive varying slots
704     */
705    int shift = 0;
706    *interp_mode = 0;
707    *ps_repl_mode = 0;
708    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
709       if (compmask & 0x1) {
710          *ps_repl_mode |= PS_REPL_S << shift;
711          shift += 2;
712       }
713       if (compmask & 0x2) {
714          *ps_repl_mode |= PS_REPL_T << shift;
715          shift += 2;
716       }
717       if (compmask & 0x4) {
718          *interp_mode |= INTERP_ZERO << shift;
719          shift += 2;
720       }
721       if (compmask & 0x8) {
722          *interp_mode |= INTERP_ONE << 6;
723          shift += 2;
724       }
725    } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
726               fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
727       /* If the last geometry shader doesn't statically write these, they're
728        * implicitly zero and the FS is supposed to read zero.
729        */
730       const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
731       if (ir3_find_output(last_shader, slot) < 0 &&
732           (compmask & 0x1)) {
733          *interp_mode |= INTERP_ZERO;
734       } else {
735          *interp_mode |= INTERP_FLAT;
736       }
737    } else if (fs->inputs[index].flat) {
738       for (int i = 0; i < 4; i++) {
739          if (compmask & (1 << i)) {
740             *interp_mode |= INTERP_FLAT << shift;
741             shift += 2;
742          }
743       }
744    }
745 
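   /* Two mode bits are consumed per enabled input component. */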
746    return util_bitcount(compmask) * 2;
747 }
748 
749 static void
750 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
751                            const struct ir3_shader_variant *fs,
752                            const struct ir3_shader_variant *last_shader)
753 {
754    uint32_t interp_modes[8] = { 0 };
755    uint32_t ps_repl_modes[8] = { 0 };
756    uint32_t interp_regs = 0;
757 
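   /* Each FS input component takes two bits in VPC_VARYING_INTERP_MODE /
    * VPC_VARYING_PS_REPL_MODE, so a varying starts at bit inloc * 2 and its
    * modes may straddle a 32-bit register boundary (handled below).
    */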
758    if (fs) {
759       for (int i = -1;
760            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
761 
762          /* get the mode for input i */
763          uint8_t interp_mode;
764          uint8_t ps_repl_mode;
765          const int bits =
766             tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
767 
768          /* OR the mode into the array */
769          const uint32_t inloc = fs->inputs[i].inloc * 2;
770          uint32_t n = inloc / 32;
771          uint32_t shift = inloc % 32;
772          interp_modes[n] |= interp_mode << shift;
773          ps_repl_modes[n] |= ps_repl_mode << shift;
774          if (shift + bits > 32) {
775             n++;
776             shift = 32 - shift;
777 
778             interp_modes[n] |= interp_mode >> shift;
779             ps_repl_modes[n] |= ps_repl_mode >> shift;
780          }
781          interp_regs = MAX2(interp_regs, n + 1);
782       }
783    }
784 
785    if (interp_regs) {
786       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
787       tu_cs_emit_array(cs, interp_modes, interp_regs);
788 
789       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
790       tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
791    }
792 }
793 
794 template <chip CHIP>
795 void
796 tu6_emit_vpc(struct tu_cs *cs,
797              const struct ir3_shader_variant *vs,
798              const struct ir3_shader_variant *hs,
799              const struct ir3_shader_variant *ds,
800              const struct ir3_shader_variant *gs,
801              const struct ir3_shader_variant *fs)
802 {
803    /* note: doesn't compile as static because of the array regs.. */
804    const struct reg_config {
805       uint16_t reg_sp_xs_out_reg;
806       uint16_t reg_sp_xs_vpc_dst_reg;
807       uint16_t reg_vpc_xs_pack;
808       uint16_t reg_vpc_xs_clip_cntl;
809       uint16_t reg_vpc_xs_clip_cntl_v2;
810       uint16_t reg_gras_xs_cl_cntl;
811       uint16_t reg_pc_xs_out_cntl;
812       uint16_t reg_sp_xs_primitive_cntl;
813       uint16_t reg_vpc_xs_layer_cntl;
814       uint16_t reg_vpc_xs_layer_cntl_v2;
815       uint16_t reg_gras_xs_layer_cntl;
816    } reg_config[] = {
817       [MESA_SHADER_VERTEX] = {
818          REG_A6XX_SP_VS_OUT_REG(0),
819          REG_A6XX_SP_VS_VPC_DST_REG(0),
820          REG_A6XX_VPC_VS_PACK,
821          REG_A6XX_VPC_VS_CLIP_CNTL,
822          REG_A6XX_VPC_VS_CLIP_CNTL_V2,
823          REG_A6XX_GRAS_VS_CL_CNTL,
824          REG_A6XX_PC_VS_OUT_CNTL,
825          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
826          REG_A6XX_VPC_VS_LAYER_CNTL,
827          REG_A6XX_VPC_VS_LAYER_CNTL_V2,
828          REG_A6XX_GRAS_VS_LAYER_CNTL
829       },
830       [MESA_SHADER_TESS_CTRL] = {
831          0,
832          0,
833          0,
834          0,
835          0,
836          0,
837          REG_A6XX_PC_HS_OUT_CNTL,
838          0,
839          0,
840          0
841       },
842       [MESA_SHADER_TESS_EVAL] = {
843          REG_A6XX_SP_DS_OUT_REG(0),
844          REG_A6XX_SP_DS_VPC_DST_REG(0),
845          REG_A6XX_VPC_DS_PACK,
846          REG_A6XX_VPC_DS_CLIP_CNTL,
847          REG_A6XX_VPC_DS_CLIP_CNTL_V2,
848          REG_A6XX_GRAS_DS_CL_CNTL,
849          REG_A6XX_PC_DS_OUT_CNTL,
850          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
851          REG_A6XX_VPC_DS_LAYER_CNTL,
852          REG_A6XX_VPC_DS_LAYER_CNTL_V2,
853          REG_A6XX_GRAS_DS_LAYER_CNTL
854       },
855       [MESA_SHADER_GEOMETRY] = {
856          REG_A6XX_SP_GS_OUT_REG(0),
857          REG_A6XX_SP_GS_VPC_DST_REG(0),
858          REG_A6XX_VPC_GS_PACK,
859          REG_A6XX_VPC_GS_CLIP_CNTL,
860          REG_A6XX_VPC_GS_CLIP_CNTL_V2,
861          REG_A6XX_GRAS_GS_CL_CNTL,
862          REG_A6XX_PC_GS_OUT_CNTL,
863          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
864          REG_A6XX_VPC_GS_LAYER_CNTL,
865          REG_A6XX_VPC_GS_LAYER_CNTL_V2,
866          REG_A6XX_GRAS_GS_LAYER_CNTL
867       },
868    };
869 
870    const struct ir3_shader_variant *last_shader;
871    if (gs) {
872       last_shader = gs;
873    } else if (hs) {
874       last_shader = ds;
875    } else {
876       last_shader = vs;
877    }
878 
879    const struct reg_config *cfg = &reg_config[last_shader->type];
880 
881    struct ir3_shader_linkage linkage = {
882       .primid_loc = 0xff,
883       .clip0_loc = 0xff,
884       .clip1_loc = 0xff,
885    };
886    if (fs)
887       ir3_link_shaders(&linkage, last_shader, fs, true);
888 
889    if (last_shader->stream_output.num_outputs)
890       ir3_link_stream_out(&linkage, last_shader);
891 
892    /* a6xx finds position/pointsize at the end */
893    const uint32_t pointsize_regid =
894       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
895    const uint32_t layer_regid =
896       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
897    const uint32_t view_regid =
898       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
899    const uint32_t clip0_regid =
900       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
901    const uint32_t clip1_regid =
902       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
903    uint32_t flags_regid = gs ?
904       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
905 
906    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
907 
908    if (layer_regid != regid(63, 0)) {
909       layer_loc = linkage.max_loc;
910       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
911    }
912 
913    if (view_regid != regid(63, 0)) {
914       view_loc = linkage.max_loc;
915       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
916    }
917 
918    unsigned extra_pos = 0;
919 
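   /* With per-view position (multiview), each view's gl_Position gets its
    * own 4-component location; extra_pos counts the extra position slots
    * beyond view 0.
    */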
920    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
921       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
922          continue;
923 
924       if (position_loc == 0xff)
925          position_loc = linkage.max_loc;
926 
927       ir3_link_add(&linkage, last_shader->outputs[i].slot,
928                    last_shader->outputs[i].regid,
929                    0xf, position_loc + 4 * last_shader->outputs[i].view);
930       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
931    }
932 
933    if (pointsize_regid != regid(63, 0)) {
934       pointsize_loc = linkage.max_loc;
935       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
936    }
937 
938    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
939 
940    /* Handle the case where clip/cull distances aren't read by the FS */
941    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
942    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
943       clip0_loc = linkage.max_loc;
944       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
945                    clip_cull_mask & 0xf, linkage.max_loc);
946    }
947    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
948       clip1_loc = linkage.max_loc;
949       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
950                    clip_cull_mask >> 4, linkage.max_loc);
951    }
952 
953    tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
954 
955    /* There is a hardware bug on a750 where a GS with an adjacency input
956     * primitive type, a points output primitive type, a STRIDE_IN_VPC of 5 to 8,
957     * and a high enough vertex count causes a hang, so bump the stride past it.
958     */
959    if (cs->device->physical_device->info->a7xx.gs_vpc_adjacency_quirk &&
960        gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
961        linkage.max_loc > 4) {
962       linkage.max_loc = MAX2(linkage.max_loc, 9);
963    }
964 
965    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
966     * at least when a DS is the last stage, so add a dummy output to keep it
967     * happy if there aren't any. We do this late in order to avoid emitting
968     * any unused code and make sure that optimizations don't remove it.
969     */
970    if (linkage.cnt == 0)
971       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
972 
973    /* map outputs of the last shader to VPC */
974    assert(linkage.cnt <= 32);
975    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
976    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
977    uint32_t sp_out[16] = {0};
978    uint32_t sp_vpc_dst[8] = {0};
979    for (uint32_t i = 0; i < linkage.cnt; i++) {
980       ((uint16_t *) sp_out)[i] =
981          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
982          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
983       ((uint8_t *) sp_vpc_dst)[i] =
984          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
985    }
986 
987    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
988    tu_cs_emit_array(cs, sp_out, sp_out_count);
989 
990    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
991    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
992 
993    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
994    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
995                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
996                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
997                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
998 
999    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1000    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1001                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1002                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1003    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
1004    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1005                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1006                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1007 
1008    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1009    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1010                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1011 
1012    const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1013 
1014    for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1015       const struct ir3_shader_variant *shader = geom_shaders[i];
1016       if (!shader)
1017          continue;
1018 
1019       bool primid = shader->type != MESA_SHADER_VERTEX &&
1020          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1021 
1022       tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1023       if (shader == last_shader) {
1024          tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1025                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1026                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1027                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1028                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1029                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
1030       } else {
1031          tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1032       }
1033    }
1034 
1035    /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
1036    if (gs)
1037       assert(flags_regid != INVALID_REG);
1038 
1039    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1040    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1041                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1042 
1043    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1044    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1045                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1046                   0xff0000);
1047    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
1048    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1049                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1050                   0xff0000);
1051 
1052    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1053    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1054                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1055 
1056    tu6_emit_vpc_varying_modes(cs, fs, last_shader);
1057 }
1058 TU_GENX(tu6_emit_vpc);
1059 
1060 static void
1061 tu6_emit_vs_params(struct tu_cs *cs,
1062                    const struct ir3_const_state *const_state,
1063                    unsigned constlen,
1064                    unsigned param_stride,
1065                    unsigned num_vertices)
1066 {
1067    uint32_t vs_params[4] = {
1068       param_stride * num_vertices * 4,  /* vs primitive stride */
1069       param_stride * 4,                 /* vs vertex stride */
1070       0,
1071       0,
1072    };
1073    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1074                   const_state, constlen, SB6_VS_SHADER, 0,
1075                   ARRAY_SIZE(vs_params), vs_params);
1076 }
1077 
1078 static void
1079 tu_get_tess_iova(struct tu_device *dev,
1080                  uint64_t *tess_factor_iova,
1081                  uint64_t *tess_param_iova)
1082 {
1083    /* Create the shared tess factor BO the first time tess is used on the device. */
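   /* Double-checked locking: re-test under dev->mutex so that only one
    * thread allocates the BO.
    */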
1084    if (!dev->tess_bo) {
1085       mtx_lock(&dev->mutex);
1086       if (!dev->tess_bo) {
1087          tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
1088                         TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
1089       }
1090       mtx_unlock(&dev->mutex);
1091    }
1092 
1093    *tess_factor_iova = dev->tess_bo->iova;
1094    *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
1095 }
1096 
1097 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
1098    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
1099 };
1100 
1101 #define HS_PARAMS_SIZE 8
1102 
1103 template <chip CHIP>
1104 static unsigned
1105 tu6_patch_control_points_size(struct tu_device *dev,
1106                               const struct tu_shader *vs,
1107                               const struct tu_shader *tcs,
1108                               const struct tu_shader *tes,
1109                               const struct tu_program_state *program,
1110                               uint32_t patch_control_points)
1111 {
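   /* These dword counts must match what tu6_emit_const() emits for the
    * preamble (UBO) and direct-constant paths, plus the three 2-dword
    * packets emitted at the end of tu6_emit_patch_control_points().
    */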
1112    if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1113 #define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
1114       return EMIT_CONST_DWORDS(4) +
1115          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1116 #undef EMIT_CONST_DWORDS
1117    } else {
1118 #define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
1119       return EMIT_CONST_DWORDS(4) +
1120          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1121 #undef EMIT_CONST_DWORDS
1122    }
1123 }
1124 
1125 template <chip CHIP>
1126 void
1127 tu6_emit_patch_control_points(struct tu_cs *cs,
1128                               const struct tu_shader *vs,
1129                               const struct tu_shader *tcs,
1130                               const struct tu_shader *tes,
1131                               const struct tu_program_state *program,
1132                               uint32_t patch_control_points)
1133 {
1134    if (!tcs->variant)
1135       return;
1136 
1137    struct tu_device *dev = cs->device;
1138 
1139    tu6_emit_vs_params(cs,
1140                       &program->link[MESA_SHADER_VERTEX].const_state,
1141                       program->link[MESA_SHADER_VERTEX].constlen,
1142                       vs->variant->output_size,
1143                       patch_control_points);
1144 
1145    uint64_t tess_factor_iova, tess_param_iova;
1146    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1147 
1148    uint32_t hs_params[HS_PARAMS_SIZE] = {
1149       vs->variant->output_size * patch_control_points * 4,  /* hs primitive stride */
1150       vs->variant->output_size * 4,                         /* hs vertex stride */
1151       tcs->variant->output_size,
1152       patch_control_points,
1153       tess_param_iova,
1154       tess_param_iova >> 32,
1155       tess_factor_iova,
1156       tess_factor_iova >> 32,
1157    };
1158 
1159    const struct ir3_const_state *hs_const =
1160       &program->link[MESA_SHADER_TESS_CTRL].const_state;
1161    unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
1162    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1163                   hs_const, hs_constlen, SB6_HS_SHADER, 0,
1164                   ARRAY_SIZE(hs_params), hs_params);
1165 
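   /* output_size is in dwords per vertex, so dividing the per-patch dword
    * count by 4 gives the size in 16-byte (vec4) units.
    */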
1166    uint32_t patch_local_mem_size_16b =
1167       patch_control_points * vs->variant->output_size / 4;
1168 
1169    /* Total attribute slots in HS incoming patch. */
1170    tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1171    tu_cs_emit(cs, patch_local_mem_size_16b);
1172 
1173    const uint32_t wavesize = 64;
1174    const uint32_t vs_hs_local_mem_size = 16384;
1175 
1176    uint32_t max_patches_per_wave;
1177    if (dev->physical_device->info->a6xx.tess_use_shared) {
1178       /* HS invocations for a patch are always within the same wave,
1179        * making barriers less expensive. VS can't have barriers so we
1180        * don't care about VS invocations being in the same wave.
1181        */
1182       max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
1183    } else {
1184       /* VS is also in the same wave */
1185       max_patches_per_wave =
1186          wavesize / MAX2(patch_control_points,
1187                          tcs->variant->tess.tcs_vertices_out);
1188    }
1189 
1190    uint32_t patches_per_wave =
1191       MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
1192            max_patches_per_wave);
1193 
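   /* The per-wave input size is programmed in 256-byte units, hence the
    * division below.
    */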
1194    uint32_t wave_input_size = DIV_ROUND_UP(
1195       patches_per_wave * patch_local_mem_size_16b * 16, 256);
1196 
1197    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1198    tu_cs_emit(cs, wave_input_size);
1199 
1200    /* maximum number of patches that can fit in tess factor/param buffers */
1201    uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
1202                         TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
1203    /* convert from # of patches to draw count */
1204    subdraw_size *= patch_control_points;
1205 
1206    tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
1207    tu_cs_emit(cs, subdraw_size);
1208 }
1209 
1210 static void
1211 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1212                           const struct ir3_shader_variant *vs,
1213                           const struct ir3_shader_variant *hs,
1214                           const struct ir3_shader_variant *ds,
1215                           const struct ir3_shader_variant *gs)
1216 {
1217    struct tu_device *dev = cs->device;
1218 
1219    if (gs && !hs) {
1220       tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
1221                          vs->output_size, gs->gs.vertices_in);
1222    }
1223 
1224    if (hs) {
1225       uint64_t tess_factor_iova, tess_param_iova;
1226       tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1227 
1228       uint32_t ds_params[8] = {
1229          gs ? ds->output_size * gs->gs.vertices_in * 4 : 0,  /* ds primitive stride */
1230          ds->output_size * 4,                                /* ds vertex stride */
1231          hs->output_size,                                    /* hs vertex stride (dwords) */
1232          hs->tess.tcs_vertices_out,
1233          tess_param_iova,
1234          tess_param_iova >> 32,
1235          tess_factor_iova,
1236          tess_factor_iova >> 32,
1237       };
1238 
1239       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1240                      ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
1241                      ARRAY_SIZE(ds_params), ds_params);
1242    }
1243 
1244    if (gs) {
1245       const struct ir3_shader_variant *prev = ds ? ds : vs;
1246       uint32_t gs_params[4] = {
1247          prev->output_size * gs->gs.vertices_in * 4,  /* gs primitive stride */
1248          prev->output_size * 4,                 /* gs vertex stride */
1249          0,
1250          0,
1251       };
1252       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1253                      gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
1254                      ARRAY_SIZE(gs_params), gs_params);
1255    }
1256 }
1257 
1258 template <chip CHIP>
1259 static void
1260 tu6_emit_program_config(struct tu_cs *cs,
1261                         const struct tu_program_state *prog,
1262                         struct tu_shader **shaders,
1263                         const struct ir3_shader_variant **variants)
1264 {
1265    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1266 
1267    bool shared_consts_enable =
1268       prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
1269    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1270 
1271    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1272          .vs_state = true,
1273          .hs_state = true,
1274          .ds_state = true,
1275          .gs_state = true,
1276          .fs_state = true,
1277          .gfx_ibo = true,
1278          .gfx_shared_const = shared_consts_enable));
1279    for (size_t stage_idx = MESA_SHADER_VERTEX;
1280         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1281       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1282       tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
1283    }
1284 
1285    for (size_t stage_idx = MESA_SHADER_VERTEX;
1286         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1287       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1288       tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
1289    }
1290 
1291    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
1292    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
1293    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
1294    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
1295 
1296    if (hs) {
1297       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1298       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1299    }
1300 
1301    if (gs) {
1302       if (hs) {
1303          tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1304       } else {
1305          tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1306       }
1307 
1308       uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1309 
1310       if (CHIP == A6XX) {
1311          /* Size of per-primitive allocation in ldlw memory, in vec4s. */
1312          uint32_t vec4_size = gs->gs.vertices_in *
1313                               DIV_ROUND_UP(prev_stage_output_size, 4);
1314 
1315          tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1316          tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1317       }
1318 
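      /* Clamp the previous stage's per-vertex output size for
       * SP_GS_PRIM_SIZE; a value of exactly 64 is written as 63, presumably
       * to avoid an issue at the field's maximum.
       */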
1319       uint32_t prim_size = prev_stage_output_size;
1320       if (prim_size > 64)
1321          prim_size = 64;
1322       else if (prim_size == 64)
1323          prim_size = 63;
1324       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1325       tu_cs_emit(cs, prim_size);
1326    }
1327 
1328    if (gs || hs) {
1329       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
1330    }
1331 }
1332 
1333 static bool
1334 contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
1335 {
1336    return (state &
1337       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1338        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
1339       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1340        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
1341 }
1342 
1343 static bool
1344 pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
1345 {
1346    return pipeline->type == TU_PIPELINE_GRAPHICS ||
1347       pipeline->type == TU_PIPELINE_COMPUTE ||
1348       contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
1349 }
1350 
1351 /* Return true if this pipeline contains all of the GPL stages listed but none
1352  * of the libraries it uses do, so this is "the first time" that all of them
1353  * are defined together. This is useful for state that needs to be combined
1354  * from multiple GPL stages.
1355  */
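/* For example, when a complete pipeline links one library that provides only
 * PRE_RASTERIZATION_SHADERS with another that provides only FRAGMENT_SHADER,
 * set_combined_state() returns true for the combined
 * (PRE_RASTERIZATION_SHADERS | FRAGMENT_SHADER) mask on the linked pipeline,
 * but false when building either library on its own, since neither library
 * contains both stages by itself.
 */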
1356 
1357 static bool
1358 set_combined_state(struct tu_pipeline_builder *builder,
1359                    struct tu_pipeline *pipeline,
1360                    VkGraphicsPipelineLibraryFlagsEXT state)
1361 {
1362    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
1363        (tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
1364       return false;
1365 
1366    for (unsigned i = 0; i < builder->num_libraries; i++) {
1367       if ((builder->libraries[i]->state & state) == state)
1368          return false;
1369    }
1370 
1371    return true;
1372 }
1373 
1374 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
1375 
1376 static VkResult
1377 tu_pipeline_allocate_cs(struct tu_device *dev,
1378                         struct tu_pipeline *pipeline,
1379                         struct tu_pipeline_layout *layout,
1380                         struct tu_pipeline_builder *builder,
1381                         const struct ir3_shader_variant *compute)
1382 {
1383    uint32_t size = 1024;
1384 
1385    /* graphics case: */
1386    if (builder) {
1387       if (builder->state &
1388           VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
1389          size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
1390       }
1391 
1392       if (set_combined_state(builder, pipeline,
1393                              VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1394                              VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1395          size += tu6_load_state_size(pipeline, layout);
1396       }
1397    } else {
1398       size += tu6_load_state_size(pipeline, layout);
1399    }
1400 
1401    /* Allocate the space for the pipeline out of the device's RO suballocator.
1402     *
1403     * Sub-allocating BOs saves memory and also reduces kernel overhead from
1404     * refcounting BOs at exec time.
1405     *
1406     * The pipeline cache would seem like a natural place to stick the
1407     * suballocator, except that it is not guaranteed to outlive the pipelines
1408     * created from it, so you can't store any long-lived state there, and you
1409     * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
1410     * pipeline destroy isn't synchronized by the cache.
1411     */
1412    mtx_lock(&dev->pipeline_mutex);
1413    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
1414                                           size * 4, 128);
1415    mtx_unlock(&dev->pipeline_mutex);
1416    if (result != VK_SUCCESS)
1417       return result;
1418 
1419    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
1420    tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
1421 
1422    return VK_SUCCESS;
1423 }
1424 
1425 static void
1426 tu_append_executable(struct tu_pipeline *pipeline,
1427                      const struct ir3_shader_variant *variant,
1428                      char *nir_from_spirv)
1429 {
1430    struct tu_pipeline_executable exe = {
1431       .stage = variant->type,
1432       .stats = variant->info,
1433       .is_binning = variant->binning_pass,
1434       .nir_from_spirv = nir_from_spirv,
1435       .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
1436       .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
1437    };
1438 
1439    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
1440 }
1441 
1442 static void
1443 tu_hash_stage(struct mesa_sha1 *ctx,
1444               VkPipelineCreateFlags2KHR pipeline_flags,
1445               const VkPipelineShaderStageCreateInfo *stage,
1446               const nir_shader *nir,
1447               const struct tu_shader_key *key)
1448 {
1449 
1450    if (nir) {
1451       struct blob blob;
1452       blob_init(&blob);
1453       nir_serialize(&blob, nir, true);
1454       _mesa_sha1_update(ctx, blob.data, blob.size);
1455       blob_finish(&blob);
1456    } else {
1457       unsigned char stage_hash[SHA1_DIGEST_LENGTH];
1458       vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
1459       _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
1460    }
1461    _mesa_sha1_update(ctx, key, sizeof(*key));
1462 }
1463 
1464 /* Hash flags which can affect ir3 shader compilation which aren't known until
1465  * logical device creation.
1466  */
1467 static void
1468 tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler)
1469 {
1470    _mesa_sha1_update(ctx, &compiler->options.robust_buffer_access2,
1471                      sizeof(compiler->options.robust_buffer_access2));
1472    _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug));
1473 }
1474 
1475 static void
1476 tu_hash_shaders(unsigned char *hash,
1477                 VkPipelineCreateFlags2KHR pipeline_flags,
1478                 const VkPipelineShaderStageCreateInfo **stages,
1479                 nir_shader *const *nir,
1480                 const struct tu_pipeline_layout *layout,
1481                 const struct tu_shader_key *keys,
1482                 VkGraphicsPipelineLibraryFlagsEXT state,
1483                 const struct ir3_compiler *compiler)
1484 {
1485    struct mesa_sha1 ctx;
1486 
1487    _mesa_sha1_init(&ctx);
1488 
1489    if (layout)
1490       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1491 
1492    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
1493       if (stages[i] || nir[i]) {
1494          tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
1495       }
1496    }
1497    _mesa_sha1_update(&ctx, &state, sizeof(state));
1498    tu_hash_compiler(&ctx, compiler);
1499    _mesa_sha1_final(&ctx, hash);
1500 }
1501 
1502 static void
1503 tu_hash_compute(unsigned char *hash,
1504                 VkPipelineCreateFlags2KHR pipeline_flags,
1505                 const VkPipelineShaderStageCreateInfo *stage,
1506                 const struct tu_pipeline_layout *layout,
1507                 const struct tu_shader_key *key,
1508                 const struct ir3_compiler *compiler)
1509 {
1510    struct mesa_sha1 ctx;
1511 
1512    _mesa_sha1_init(&ctx);
1513 
1514    if (layout)
1515       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1516 
1517    tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
1518 
1519    tu_hash_compiler(&ctx, compiler);
1520    _mesa_sha1_final(&ctx, hash);
1521 }
1522 
1523 static struct tu_shader *
1524 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
1525                          const void *key_data, size_t key_size,
1526                          bool *application_cache_hit)
1527 {
1528    struct vk_pipeline_cache_object *object =
1529       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1530                                       &tu_shader_ops, application_cache_hit);
1531    if (object)
1532       return container_of(object, struct tu_shader, base);
1533    else
1534       return NULL;
1535 }
1536 
1537 static struct tu_shader *
1538 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
1539                          struct tu_shader *shader)
1540 {
1541    struct vk_pipeline_cache_object *object =
1542       vk_pipeline_cache_add_object(cache, &shader->base);
1543    return container_of(object, struct tu_shader, base);
1544 }
1545 
1546 static bool
1547 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1548                          struct blob *blob);
1549 
1550 static struct vk_pipeline_cache_object *
1551 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1552                            const void *key_data,
1553                            size_t key_size,
1554                            struct blob_reader *blob);
1555 
1556 static void
1557 tu_nir_shaders_destroy(struct vk_device *device,
1558                        struct vk_pipeline_cache_object *object)
1559 {
1560    struct tu_nir_shaders *shaders =
1561       container_of(object, struct tu_nir_shaders, base);
1562 
1563    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
1564       ralloc_free(shaders->nir[i]);
1565 
1566    vk_pipeline_cache_object_finish(&shaders->base);
1567    vk_free(&device->alloc, shaders);
1568 }
1569 
1570 const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
1571    .serialize = tu_nir_shaders_serialize,
1572    .deserialize = tu_nir_shaders_deserialize,
1573    .destroy = tu_nir_shaders_destroy,
1574 };
1575 
1576 static struct tu_nir_shaders *
1577 tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
1578 {
1579    VK_MULTIALLOC(ma);
1580    VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
1581    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
1582 
1583    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
1584                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
1585       return NULL;
1586 
1587    memcpy(obj_key_data, key_data, key_size);
1588    vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
1589                                  &tu_nir_shaders_ops, obj_key_data, key_size);
1590 
1591    return shaders;
1592 }
1593 
1594 static bool
1595 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1596                          struct blob *blob)
1597 {
1598    struct tu_nir_shaders *shaders =
1599       container_of(object, struct tu_nir_shaders, base);
1600 
1601    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1602       if (shaders->nir[i]) {
1603          blob_write_uint8(blob, 1);
1604          nir_serialize(blob, shaders->nir[i], true);
1605       } else {
1606          blob_write_uint8(blob, 0);
1607       }
1608    }
1609 
1610    return true;
1611 }
1612 
1613 static struct vk_pipeline_cache_object *
1614 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1615                            const void *key_data,
1616                            size_t key_size,
1617                            struct blob_reader *blob)
1618 {
1619    struct tu_device *dev =
1620       container_of(cache->base.device, struct tu_device, vk);
1621    struct tu_nir_shaders *shaders =
1622       tu_nir_shaders_init(dev, key_data, key_size);
1623 
1624    if (!shaders)
1625       return NULL;
1626 
1627    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1628       if (blob_read_uint8(blob)) {
1629          shaders->nir[i] =
1630             nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
1631       }
1632    }
1633 
1634    return &shaders->base;
1635 }
1636 
1637 static struct tu_nir_shaders *
1638 tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
1639                     const void *key_data, size_t key_size,
1640                     bool *application_cache_hit)
1641 {
1642    struct vk_pipeline_cache_object *object =
1643       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1644                                       &tu_nir_shaders_ops, application_cache_hit);
1645    if (object)
1646       return container_of(object, struct tu_nir_shaders, base);
1647    else
1648       return NULL;
1649 }
1650 
1651 static struct tu_nir_shaders *
1652 tu_nir_cache_insert(struct vk_pipeline_cache *cache,
1653                     struct tu_nir_shaders *shaders)
1654 {
1655    struct vk_pipeline_cache_object *object =
1656       vk_pipeline_cache_add_object(cache, &shaders->base);
1657    return container_of(object, struct tu_nir_shaders, base);
1658 }
1659 
1660 static VkResult
1661 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
1662                                     struct tu_pipeline *pipeline)
1663 {
1664    VkResult result = VK_SUCCESS;
1665    const struct ir3_compiler *compiler = builder->device->compiler;
1666    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
1667       NULL
1668    };
1669    VkPipelineCreationFeedback pipeline_feedback = {
1670       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
1671    };
1672    VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
1673 
1674    const bool executable_info =
1675       builder->create_flags &
1676       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
1677 
1678    bool retain_nir =
1679       builder->create_flags &
1680       VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
1681 
1682    int64_t pipeline_start = os_time_get_nano();
1683 
1684    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
1685       vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
1686 
1687    bool must_compile = false;
1688    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1689       if (!(builder->active_stages & builder->create_info->pStages[i].stage))
1690          continue;
1691 
1692       gl_shader_stage stage =
1693          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1694       stage_infos[stage] = &builder->create_info->pStages[i];
1695       must_compile = true;
1696    }
1697 
1698    /* Forward declare everything due to the goto usage */
1699    nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
1700    struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
1701    nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
1702    char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
1703    bool cache_hit = false;
1704 
1705    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
1706    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1707         stage < ARRAY_SIZE(keys); stage = (gl_shader_stage) (stage+1)) {
1708       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
1709       if (stage_infos[stage])
1710          subgroup_info = vk_find_struct_const(stage_infos[stage],
1711                                               PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
1712       bool allow_varying_subgroup_size =
1713          !stage_infos[stage] ||
1714          (stage_infos[stage]->flags &
1715           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
1716       bool require_full_subgroups =
1717          stage_infos[stage] &&
1718          (stage_infos[stage]->flags &
1719           VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
1720       tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
1721                                   require_full_subgroups, subgroup_info,
1722                                   builder->device);
1723    }
1724 
1725    if (builder->create_flags &
1726        VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
1727       for (unsigned i = 0; i < builder->num_libraries; i++) {
1728          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1729 
1730          for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
1731             if (library->shaders[j].nir) {
1732                assert(!nir[j]);
1733                nir[j] = nir_shader_clone(builder->mem_ctx,
1734                      library->shaders[j].nir);
1735                keys[j] = library->shaders[j].key;
1736                must_compile = true;
1737             }
1738          }
1739       }
1740    }
1741 
1742    struct tu_nir_shaders *nir_shaders = NULL;
1743    if (!must_compile)
1744       goto done;
1745 
1746    if (builder->state &
1747        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1748       keys[MESA_SHADER_VERTEX].multiview_mask =
1749          builder->graphics_state.rp->view_mask;
1750    }
1751 
1752    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1753       keys[MESA_SHADER_FRAGMENT].multiview_mask =
1754          builder->graphics_state.rp->view_mask;
1755       keys[MESA_SHADER_FRAGMENT].fragment_density_map =
1756          builder->fragment_density_map;
1757       keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
1758          builder->unscaled_input_fragcoord;
1759 
1760       const VkPipelineMultisampleStateCreateInfo *msaa_info =
1761          builder->create_info->pMultisampleState;
1762 
1763       /* The 1.3.215 spec says:
1764        *
1765        *    Sample shading can be used to specify a minimum number of unique
1766        *    samples to process for each fragment. If sample shading is enabled,
1767        *    an implementation must provide a minimum of
1768        *
1769        *       max(ceil(minSampleShadingFactor * totalSamples), 1)
1770        *
1771        *    unique associated data for each fragment, where
1772        *    minSampleShadingFactor is the minimum fraction of sample shading.
1773        *
1774        * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
1775        * They both require unique associated data.
1776        *
1777        * There are discussions to change the definition, such that
1778        * sampleShadingEnable does not imply unique associated data.  Before the
1779        * discussions are settled and before apps (i.e., ANGLE) are fixed to
1780        * follow the new and incompatible definition, we should stick to the
1781        * current definition.
1782        *
1783        * Note that ir3_shader_key::sample_shading is not actually used by ir3,
1784        * just checked in tu6_emit_fs_inputs.  We will also copy the value to
1785        * tu_shader_key::force_sample_interp in a bit.
1786        */
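      /* Worked example of the spec formula above (illustrative numbers, not
       * taken from any app): with minSampleShading = 0.5 and 4 samples per
       * pixel, max(ceil(0.5 * 4), 1) = 2, i.e. at least two unique sets of
       * associated data per fragment.
       */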
1787       keys[MESA_SHADER_FRAGMENT].force_sample_interp =
1788          !builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
1789    }
1790 
1791    unsigned char pipeline_sha1[20];
1792    tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
1793                    &builder->layout, keys, builder->state, compiler);
1794 
1795    unsigned char nir_sha1[21];
1796    memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1797    nir_sha1[20] = 'N';
1798 
1799    if (!executable_info) {
1800       cache_hit = true;
1801       bool application_cache_hit = false;
1802 
1803       unsigned char shader_sha1[21];
1804       memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1805 
1806       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1807            stage = (gl_shader_stage) (stage + 1)) {
1808          if (stage_infos[stage] || nir[stage]) {
1809             bool shader_application_cache_hit;
1810             shader_sha1[20] = (unsigned char) stage;
1811             shaders[stage] =
1812                tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
1813                                         sizeof(shader_sha1),
1814                                         &shader_application_cache_hit);
1815             if (!shaders[stage]) {
1816                cache_hit = false;
1817                break;
1818             }
1819             application_cache_hit &= shader_application_cache_hit;
1820          }
1821       }
1822 
1823       /* If the user asks us to keep the NIR around, we need to have it for a
1824        * successful cache hit. If we only have a "partial" cache hit, then we
1825        * still need to recompile in order to get the NIR.
1826        */
1827       if (cache_hit &&
1828           (builder->create_flags &
1829            VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
1830          bool nir_application_cache_hit = false;
1831          nir_shaders =
1832             tu_nir_cache_lookup(builder->cache, &nir_sha1,
1833                                 sizeof(nir_sha1),
1834                                 &nir_application_cache_hit);
1835 
1836          application_cache_hit &= nir_application_cache_hit;
1837          cache_hit &= !!nir_shaders;
1838       }
1839 
1840       if (application_cache_hit && builder->cache != builder->device->mem_cache) {
1841          pipeline_feedback.flags |=
1842             VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
1843       }
1844    }
1845 
1846    if (!cache_hit) {
1847       if (builder->create_flags &
1848           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
1849          return VK_PIPELINE_COMPILE_REQUIRED;
1850       }
1851 
1852       result = tu_compile_shaders(builder->device,
1853                                   builder->create_flags,
1854                                   stage_infos,
1855                                   nir,
1856                                   keys,
1857                                   &builder->layout,
1858                                   pipeline_sha1,
1859                                   shaders,
1860                                   executable_info ? nir_initial_disasm : NULL,
1861                                   pipeline->executables_mem_ctx,
1862                                   retain_nir ? post_link_nir : NULL,
1863                                   stage_feedbacks);
1864 
1865       if (result != VK_SUCCESS)
1866          goto fail;
1867 
1868       if (retain_nir) {
1869          nir_shaders =
1870             tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
1871          for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1872               stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1873             if (!post_link_nir[stage])
1874                continue;
1875 
1876             nir_shaders->nir[stage] = post_link_nir[stage];
1877          }
1878 
1879          nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
1880       }
1881 
1882       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1883            stage = (gl_shader_stage) (stage + 1)) {
1884          if (!nir[stage])
1885             continue;
1886 
1887          shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
1888       }
1889    }
1890 
1891 done:
1892 
1893    /* Create empty shaders which contain the draw states to initialize
1894     * registers for unused shader stages.
1895     */
1896    if (builder->state &
1897        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1898       if (!shaders[MESA_SHADER_TESS_CTRL]) {
1899          shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
1900          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
1901       }
1902       if (!shaders[MESA_SHADER_TESS_EVAL]) {
1903          shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
1904          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
1905       }
1906       if (!shaders[MESA_SHADER_GEOMETRY]) {
1907          shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
1908          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
1909       }
1910    }
1911 
1912    if (builder->state &
1913        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1914       if (!shaders[MESA_SHADER_FRAGMENT]) {
1915          shaders[MESA_SHADER_FRAGMENT] =
1916             builder->fragment_density_map ?
1917             builder->device->empty_fs_fdm : builder->device->empty_fs;
1918          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
1919       }
1920    }
1921 
1922    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1923         stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1924       if (shaders[stage] && shaders[stage]->variant) {
1925          tu_append_executable(pipeline, shaders[stage]->variant,
1926                               nir_initial_disasm[stage]);
1927       }
1928    }
1929 
1930    /* We may have deduplicated a cache entry, in which case our original
1931     * post_link_nir may be gone.
1932     */
1933    if (nir_shaders) {
1934       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1935            stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1936          if (nir_shaders->nir[stage]) {
1937             post_link_nir[stage] = nir_shaders->nir[stage];
1938          }
1939       }
1940    }
1941 
1942    /* In the case where we're building a library without link-time
1943     * optimization but with sub-libraries that retain LTO info, we should
1944     * retain it ourselves in case another pipeline includes us with LTO.
1945     */
1946    for (unsigned i = 0; i < builder->num_libraries; i++) {
1947       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1948       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1949            stage < ARRAY_SIZE(library->shaders);
1950            stage = (gl_shader_stage) (stage + 1)) {
1951          if (!post_link_nir[stage] && library->shaders[stage].nir) {
1952             post_link_nir[stage] = library->shaders[stage].nir;
1953             keys[stage] = library->shaders[stage].key;
1954          }
1955 
1956          if (!shaders[stage] && library->base.shaders[stage]) {
1957             shaders[stage] = library->base.shaders[stage];
1958             vk_pipeline_cache_object_ref(&shaders[stage]->base);
1959          }
1960       }
1961    }
1962 
1963    if (shaders[MESA_SHADER_VERTEX]) {
1964       const struct ir3_shader_variant *vs =
1965          shaders[MESA_SHADER_VERTEX]->variant;
1966 
1967       if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
1968          tu_append_executable(pipeline, vs->binning, NULL);
1969       }
1970    }
1971 
1972    if (pipeline_contains_all_shader_state(pipeline)) {
1973       /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
1974        * when compiling all stages, but make sure we don't leak.
1975        */
1976       if (nir_shaders)
1977          vk_pipeline_cache_object_unref(&builder->device->vk,
1978                                         &nir_shaders->base);
1979    } else {
1980       struct tu_graphics_lib_pipeline *library =
1981          tu_pipeline_to_graphics_lib(pipeline);
1982       library->nir_shaders = nir_shaders;
1983       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1984            stage < ARRAY_SIZE(library->shaders);
1985            stage = (gl_shader_stage) (stage + 1)) {
1986          library->shaders[stage].nir = post_link_nir[stage];
1987          library->shaders[stage].key = keys[stage];
1988       }
1989    }
1990 
1991    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1992         stage < ARRAY_SIZE(shaders); stage = (gl_shader_stage) (stage + 1)) {
1993       pipeline->shaders[stage] = shaders[stage];
1994       if (shaders[stage])
1995          pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
1996    }
1997 
1998    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
1999    if (creation_feedback) {
2000       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
2001 
2002       for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2003          gl_shader_stage s =
2004             vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2005          creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
2006       }
2007    }
2008 
2009    return VK_SUCCESS;
2010 
2011 fail:
2012    if (nir_shaders)
2013       vk_pipeline_cache_object_unref(&builder->device->vk,
2014                                      &nir_shaders->base);
2015 
2016    return result;
2017 }
2018 
2019 static void
2020 tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
2021                                     struct tu_pipeline *pipeline)
2022 {
2023    const VkPipelineLibraryCreateInfoKHR *library_info =
2024       vk_find_struct_const(builder->create_info->pNext,
2025                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
2026 
2027    if (library_info) {
2028       assert(library_info->libraryCount <= MAX_LIBRARIES);
2029       builder->num_libraries = library_info->libraryCount;
2030       for (unsigned i = 0; i < library_info->libraryCount; i++) {
2031          VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
2032          builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
2033       }
2034    }
2035 
2036    /* Merge in the state from libraries. The program state is a bit special
2037     * and is handled separately.
2038     */
2039    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2040       tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
2041    for (unsigned i = 0; i < builder->num_libraries; i++) {
2042       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2043       if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2044          tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
2045 
2046       if (library->state &
2047           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
2048          pipeline->output = library->base.output;
2049          pipeline->lrz_blend.reads_dest |= library->base.lrz_blend.reads_dest;
2050          pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
2051          pipeline->prim_order = library->base.prim_order;
2052       }
2053 
2054       if ((library->state &
2055            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
2056           (library->state &
2057            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
2058          pipeline->prim_order = library->base.prim_order;
2059       }
2060 
2061       pipeline->set_state_mask |= library->base.set_state_mask;
2062 
2063       u_foreach_bit (i, library->base.set_state_mask) {
2064          pipeline->dynamic_state[i] = library->base.dynamic_state[i];
2065       }
2066 
2067       if (contains_all_shader_state(library->state)) {
2068          pipeline->program = library->base.program;
2069          pipeline->load_state = library->base.load_state;
2070          for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
2071             if (library->base.shaders[i]) {
2072                pipeline->shaders[i] = library->base.shaders[i];
2073                vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
2074             }
2075          }
2076       }
2077 
2078       BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
2079                 library->base.static_state_mask);
2080 
2081       vk_graphics_pipeline_state_merge(&builder->graphics_state,
2082                                        &library->graphics_state);
2083    }
2084 }
2085 
2086 static void
2087 tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
2088                                  struct tu_pipeline *pipeline)
2089 {
2090    VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
2091 
2092    if (layout) {
2093       /* Note: it's still valid to have a layout even if there are libraries.
2094        * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
2095        * a non-INDEPENDENT_SET layout, which may let us use a faster path;
2096        * currently this just affects dynamic offset descriptors.
2097        */
2098       builder->layout = *layout;
2099    } else {
2100       for (unsigned i = 0; i < builder->num_libraries; i++) {
2101          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2102          builder->layout.num_sets = MAX2(builder->layout.num_sets,
2103                                          library->num_sets);
2104          assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
2105          for (unsigned j = 0; j < library->num_sets; j++) {
2106             builder->layout.set[j].layout = library->layouts[j];
2107          }
2108 
2109          builder->layout.push_constant_size = library->push_constant_size;
2110       }
2111 
2112       tu_pipeline_layout_init(&builder->layout);
2113    }
2114 
2115    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
2116       struct tu_graphics_lib_pipeline *library =
2117          tu_pipeline_to_graphics_lib(pipeline);
2118       library->num_sets = builder->layout.num_sets;
2119       for (unsigned i = 0; i < library->num_sets; i++) {
2120          library->layouts[i] = builder->layout.set[i].layout;
2121          if (library->layouts[i])
2122             vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
2123       }
2124       library->push_constant_size = builder->layout.push_constant_size;
2125    }
2126 }
2127 
2128 static void
2129 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2130                         struct tu_const_state *const_state,
2131                         const struct ir3_shader_variant *v)
2132 {
2133    link->const_state = *ir3_const_state(v);
2134    link->tu_const_state = *const_state;
2135    link->constlen = v->constlen;
2136 }
2137 
2138 template <chip CHIP>
2139 static void
2140 tu_emit_program_state(struct tu_cs *sub_cs,
2141                       struct tu_program_state *prog,
2142                       struct tu_shader **shaders)
2143 {
2144    struct tu_device *dev = sub_cs->device;
2145    struct tu_cs prog_cs;
2146 
2147    const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
2148    struct tu_draw_state draw_states[MESA_SHADER_STAGES];
2149 
2150    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2151         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2152       variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
2153    }
2154 
2155    uint32_t safe_variants =
2156       ir3_trim_constlen(variants, dev->compiler);
2157 
2158    unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
2159 
2160    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2161         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2162       if (shaders[stage]) {
2163          if (safe_variants & (1u << stage)) {
2164             variants[stage] = shaders[stage]->safe_const_variant;
2165             draw_states[stage] = shaders[stage]->safe_const_state;
2166          } else {
2167             draw_states[stage] = shaders[stage]->state;
2168          }
2169 
2170          for (unsigned i = 0; i < MAX_SETS; i++) {
2171             if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
2172                dynamic_descriptor_sizes[i] =
2173                   shaders[stage]->dynamic_descriptor_sizes[i];
2174             }
2175          }
2176       }
2177    }
2178 
2179    for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
2180       if (!variants[i])
2181          continue;
2182 
2183       tu_pipeline_set_linkage(&prog->link[i],
2184                               &shaders[i]->const_state,
2185                               variants[i]);
2186 
2187       struct tu_push_constant_range *push_consts =
2188          &shaders[i]->const_state.push_consts;
2189       if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
2190           push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
2191          prog->shared_consts = *push_consts;
2192       }
2193    }
2194 
2195    unsigned dynamic_descriptor_offset = 0;
2196    for (unsigned i = 0; i < MAX_SETS; i++) {
2197       prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
2198       dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
2199    }
2200 
2201    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2202     * else that could depend on that state (like push constants)
2203     *
2204     * Note also that this always uses the full VS even in binning pass.  The
2205     * binning pass variant has the same const layout as the full VS, and
2206     * the constlen for the VS will be the same or greater than the constlen
2207     * for the binning pass variant.  It is required that the constlen state
2208     * matches between binning and draw passes, as some parts of the push
2209     * consts are emitted in state groups that are shared between the binning
2210     * and draw passes.
2211     */
2212    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2213    tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
2214    prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2215 
2216    prog->vs_state = draw_states[MESA_SHADER_VERTEX];
2217 
2218    /* Don't use the binning pass variant when GS is present because we don't
2219     * support compiling correct binning pass variants with GS.
2220     */
2221    if (variants[MESA_SHADER_GEOMETRY]) {
2222       prog->vs_binning_state = prog->vs_state;
2223    } else {
2224       prog->vs_binning_state =
2225          shaders[MESA_SHADER_VERTEX]->binning_state;
2226    }
2227 
2228    prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
2229    prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
2230    prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
2231    prog->gs_binning_state =
2232       shaders[MESA_SHADER_GEOMETRY]->binning_state;
2233    prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
2234 
2235    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
2236    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
2237    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
2238    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
2239    const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
2240 
2241    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2242    tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
2243    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2244 
2245    const struct ir3_shader_variant *last_shader;
2246    if (gs)
2247       last_shader = gs;
2248    else if (ds)
2249       last_shader = ds;
2250    else
2251       last_shader = vs;
2252 
2253    prog->per_view_viewport =
2254       !last_shader->writes_viewport &&
2255       shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
2256       dev->physical_device->info->a6xx.has_per_view_viewport;
2257 }
2258 
2259 static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
2260    MESA_VK_DYNAMIC_VI,
2261 };
2262 
2263 template <chip CHIP>
2264 static unsigned
2265 tu6_vertex_input_size(struct tu_device *dev,
2266                       const struct vk_vertex_input_state *vi)
2267 {
2268    return 1 + 2 * util_last_bit(vi->attributes_valid);
2269 }
2270 
2271 template <chip CHIP>
2272 static void
2273 tu6_emit_vertex_input(struct tu_cs *cs,
2274                       const struct vk_vertex_input_state *vi)
2275 {
2276    unsigned attr_count = util_last_bit(vi->attributes_valid);
2277    if (attr_count != 0)
2278       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
2279 
2280    for (uint32_t loc = 0; loc < attr_count; loc++) {
2281       const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
2282 
2283       if (vi->attributes_valid & (1u << loc)) {
2284          const struct vk_vertex_binding_state *binding =
2285             &vi->bindings[attr->binding];
2286 
2287          enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
2288          const struct tu_native_format format = tu6_format_vtx(pipe_format);
2289          tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
2290                           .idx = attr->binding,
2291                           .offset = attr->offset,
2292                           .instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2293                           .format = format.fmt,
2294                           .swap = format.swap,
2295                           .unk30 = 1,
2296                           ._float = !util_format_is_pure_integer(pipe_format)).value);
2297          tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, binding->divisor).value);
2298       } else {
2299          tu_cs_emit(cs, 0);
2300          tu_cs_emit(cs, 0);
2301       }
2302    }
2303 }
2304 
2305 static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
2306    MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
2307    MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
2308 };
2309 
2310 template <chip CHIP>
2311 static unsigned
2312 tu6_vertex_stride_size(struct tu_device *dev,
2313                        const struct vk_vertex_input_state *vi)
2314 {
2315    return 1 + 2 * util_last_bit(vi->bindings_valid);
2316 }
2317 
2318 template <chip CHIP>
2319 static void
2320 tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
2321 {
2322    if (vi->bindings_valid) {
2323       unsigned bindings_count = util_last_bit(vi->bindings_valid);
2324       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2325       for (unsigned i = 0; i < bindings_count; i++) {
2326          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2327          tu_cs_emit(cs, vi->bindings[i].stride);
2328       }
2329    }
2330 }
2331 
2332 template <chip CHIP>
2333 static unsigned
2334 tu6_vertex_stride_size_dyn(struct tu_device *dev,
2335                            const uint16_t *vi_binding_stride,
2336                            uint32_t bindings_valid)
2337 {
2338    return 1 + 2 * util_last_bit(bindings_valid);
2339 }
2340 
2341 template <chip CHIP>
2342 static void
2343 tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
2344                            uint32_t bindings_valid)
2345 {
2346    if (bindings_valid) {
2347       unsigned bindings_count = util_last_bit(bindings_valid);
2348       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2349       for (unsigned i = 0; i < bindings_count; i++) {
2350          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2351          tu_cs_emit(cs, vi_binding_stride[i]);
2352       }
2353    }
2354 }
2355 
2356 static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
2357    MESA_VK_DYNAMIC_VP_VIEWPORTS,
2358    MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
2359    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2360    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
2361 };
2362 
2363 template <chip CHIP>
2364 static unsigned
2365 tu6_viewport_size(struct tu_device *dev,
2366                   const struct vk_viewport_state *vp,
2367                   const struct vk_rasterization_state *rs)
2368 {
2369    return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
2370       1 + vp->viewport_count * 2 + 5;
2371 }
2372 
2373 template <chip CHIP>
2374 static void
2375 tu6_emit_viewport(struct tu_cs *cs,
2376                   const struct vk_viewport_state *vp,
2377                   const struct vk_rasterization_state *rs)
2378 {
2379    VkExtent2D guardband = {511, 511};
2380 
2381    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), vp->viewport_count * 6);
2382    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2383       const VkViewport *viewport = &vp->viewports[i];
2384       float offsets[3];
2385       float scales[3];
2386       scales[0] = viewport->width / 2.0f;
2387       scales[1] = viewport->height / 2.0f;
2388       if (vp->depth_clip_negative_one_to_one) {
2389          scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
2390       } else {
2391          scales[2] = viewport->maxDepth - viewport->minDepth;
2392       }
2393 
2394       offsets[0] = viewport->x + scales[0];
2395       offsets[1] = viewport->y + scales[1];
2396       if (vp->depth_clip_negative_one_to_one) {
2397          offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
2398       } else {
2399          offsets[2] = viewport->minDepth;
2400       }
2401 
2402       for (uint32_t j = 0; j < 3; j++) {
2403          tu_cs_emit(cs, fui(offsets[j]));
2404          tu_cs_emit(cs, fui(scales[j]));
2405       }
2406 
2407       guardband.width =
2408          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
2409       guardband.height =
2410          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
2411    }
2412 
2413    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), vp->viewport_count * 2);
2414    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2415       const VkViewport *viewport = &vp->viewports[i];
2416       VkOffset2D min;
2417       VkOffset2D max;
2418       min.x = (int32_t) viewport->x;
2419       max.x = (int32_t) ceilf(viewport->x + viewport->width);
2420       if (viewport->height >= 0.0f) {
2421          min.y = (int32_t) viewport->y;
2422          max.y = (int32_t) ceilf(viewport->y + viewport->height);
2423       } else {
2424          min.y = (int32_t)(viewport->y + viewport->height);
2425          max.y = (int32_t) ceilf(viewport->y);
2426       }
2427       /* the spec allows viewport->height to be 0.0f */
2428       if (min.y == max.y)
2429          max.y++;
2430       /* allow viewport->width = 0.0f for un-initialized viewports: */
2431       if (min.x == max.x)
2432          max.x++;
2433 
2434       min.x = MAX2(min.x, 0);
2435       min.y = MAX2(min.y, 0);
2436       max.x = MAX2(max.x, 1);
2437       max.y = MAX2(max.y, 1);
2438 
2439       assert(min.x < max.x);
2440       assert(min.y < max.y);
2441 
2442       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
2443                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
2444       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
2445                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
2446    }
2447 
2448    /* A7XX+ doesn't clamp to [0,1] with disabled depth clamp, to support
2449     * VK_EXT_depth_clamp_zero_one we have to always enable clamp and manually
2450     * set range to [0,1] when rs->depth_clamp_enable is false.
2451     */
2452    bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
2453 
2454    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), vp->viewport_count * 2);
2455    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2456       const VkViewport *viewport = &vp->viewports[i];
2457       if (zero_one_depth_clamp) {
2458          tu_cs_emit(cs, fui(0.0f));
2459          tu_cs_emit(cs, fui(1.0f));
2460       } else {
2461          tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
2462          tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
2463       }
2464    }
2465    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
2466    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
2467                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
2468 
2469    /* TODO: what to do about this and multi viewport? */
2470    float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2471    float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2472    if (zero_one_depth_clamp) {
2473       z_clamp_min = 0.0f;
2474       z_clamp_max = 1.0f;
2475    }
2476 
2477    tu_cs_emit_regs(cs,
2478                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2479                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2480 }
2481 
2482 struct apply_viewport_state {
2483    struct vk_viewport_state vp;
2484    struct vk_rasterization_state rs;
2485    bool share_scale;
2486 };
2487 
2488 /* It's a hardware restriction that the window offset (i.e. bin.offset) must
2489  * be the same for all views. This means that GMEM coordinates cannot be a
2490  * simple scaling of framebuffer coordinates, because this would require us to
2491  * scale the window offset and the scale may be different per view. Instead we
2492  * have to apply a per-bin offset to the GMEM coordinate transform to make
2493  * sure that the window offset maps to itself. Specifically we need an offset
2494  * o to the transform:
2495  *
2496  * x' = s * x + o
2497  *
2498  * so that when we plug in the bin start b_s:
2499  *
2500  * b_s = s * b_s + o
2501  *
2502  * and we get:
2503  *
2504  * o = b_s - s * b_s
2505  *
2506  * We use this form exactly, because we know the bin offset is a multiple of
2507  * the frag area so s * b_s is an integer and we can compute an exact result
2508  * easily.
2509  */
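/* A minimal worked example (illustrative numbers): with a fragment area of
 * width 2 the scale is s = 1/2, and for a bin starting at b_s = 96 the offset
 * is o = 96 - (1/2) * 96 = 48, so the transform x' = x/2 + 48 maps the bin
 * start 96 back onto itself, as required above.
 */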
2510 
2511 VkOffset2D
2512 tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
2513 {
2514    assert(bin.offset.x % frag_area.width == 0);
2515    assert(bin.offset.y % frag_area.height == 0);
2516 
2517    return (VkOffset2D) {
2518       bin.offset.x - bin.offset.x / frag_area.width,
2519       bin.offset.y - bin.offset.y / frag_area.height
2520    };
2521 }
2522 
2523 static void
2524 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2525                     VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2526 {
2527    const struct apply_viewport_state *state =
2528       (const struct apply_viewport_state *)data;
2529 
2530    struct vk_viewport_state vp = state->vp;
2531 
2532    for (unsigned i = 0; i < state->vp.viewport_count; i++) {
2533       /* Note: If we're using shared scaling, the scale should already be the
2534        * same across all views, so we can pick any view. However, the number
2535        * of viewports and the number of views are not guaranteed to be the
2536        * same, so we pick the 0'th view, which always exists, to be safe.
2537        *
2538        * Conversely, if we're not using shared scaling then the rasterizer in
2539        * the original pipeline is using only the first viewport, so we need to
2540        * replicate it across all viewports.
2541        */
2542       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2543       VkViewport viewport =
2544          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
2545       if (frag_area.width == 1 && frag_area.height == 1) {
2546          vp.viewports[i] = viewport;
2547          continue;
2548       }
2549 
2550       float scale_x = (float) 1.0f / frag_area.width;
2551       float scale_y = (float) 1.0f / frag_area.height;
2552 
2553       vp.viewports[i].minDepth = viewport.minDepth;
2554       vp.viewports[i].maxDepth = viewport.maxDepth;
2555       vp.viewports[i].width = viewport.width * scale_x;
2556       vp.viewports[i].height = viewport.height * scale_y;
2557 
2558       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2559 
2560       vp.viewports[i].x = scale_x * viewport.x + offset.x;
2561       vp.viewports[i].y = scale_y * viewport.y + offset.y;
2562    }
2563 
2564    TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
2565 }
2566 
2567 static void
2568 tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2569                       const struct vk_viewport_state *vp,
2570                       const struct vk_rasterization_state *rs)
2571 {
2572    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2573    struct apply_viewport_state state = {
2574       .vp = *vp,
2575       .rs = *rs,
2576       .share_scale = !cmd->state.per_view_viewport,
2577    };
2578    if (!state.share_scale)
2579       state.vp.viewport_count = num_views;
2580    unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
2581    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2582    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state);
2583 }
2584 
2585 static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
2586    MESA_VK_DYNAMIC_VP_SCISSORS,
2587    MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
2588 };
2589 
2590 template <chip CHIP>
2591 static unsigned
2592 tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2593 {
2594    return 1 + vp->scissor_count * 2;
2595 }
2596 
2597 template <chip CHIP>
2598 void
2599 tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
2600 {
2601    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), vp->scissor_count * 2);
2602 
2603    for (uint32_t i = 0; i < vp->scissor_count; i++) {
2604       const VkRect2D *scissor = &vp->scissors[i];
2605 
2606       uint32_t min_x = scissor->offset.x;
2607       uint32_t min_y = scissor->offset.y;
2608       uint32_t max_x = min_x + scissor->extent.width - 1;
2609       uint32_t max_y = min_y + scissor->extent.height - 1;
2610 
2611       if (!scissor->extent.width || !scissor->extent.height) {
2612          min_x = min_y = 1;
2613          max_x = max_y = 0;
2614       } else {
2615          /* avoid overflow */
2616          uint32_t scissor_max = BITFIELD_MASK(15);
2617          min_x = MIN2(scissor_max, min_x);
2618          min_y = MIN2(scissor_max, min_y);
2619          max_x = MIN2(scissor_max, max_x);
2620          max_y = MIN2(scissor_max, max_y);
2621       }
2622 
2623       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2624                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2625       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2626                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2627    }
2628 }
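
/* In the empty-scissor case above, TL is programmed as (1, 1) and BR as
 * (0, 0). Since BR is inclusive (max = min + extent - 1 in the normal path),
 * a TL greater than BR describes an empty region and should reject all
 * rasterization for that scissor.
 */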
2629 
2630 static void
2631 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2632                    VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2633 {
2634    const struct apply_viewport_state *state =
2635       (const struct apply_viewport_state *)data;
2636 
2637    struct vk_viewport_state vp = state->vp;
2638 
2639    for (unsigned i = 0; i < vp.scissor_count; i++) {
2640       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2641       VkRect2D scissor =
2642          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
2643       if (frag_area.width == 1 && frag_area.height == 1) {
2644          vp.scissors[i] = scissor;
2645          continue;
2646       }
2647 
2648       /* Transform the scissor following the viewport. It's unclear how this
2649        * is supposed to handle cases where the scissor isn't aligned to the
2650        * fragment area, but we round outwards, which ensures that partial
2651        * fragments are still rendered when the scissor size equals the
2652        * framebuffer size and isn't aligned to the fragment area.
2653        */
2654       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2655       VkOffset2D min = {
2656          scissor.offset.x / frag_area.width + offset.x,
2657          scissor.offset.y / frag_area.height + offset.y,
2658       };
2659       VkOffset2D max = {
2660          DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
2661          DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
2662       };
2663 
2664       /* Intersect scissor with the scaled bin, this essentially replaces the
2665        * window scissor.
2666        */
2667       uint32_t scaled_width = bin.extent.width / frag_area.width;
2668       uint32_t scaled_height = bin.extent.height / frag_area.height;
2669       vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
2670       vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
2671       vp.scissors[i].extent.width =
2672          MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
2673       vp.scissors[i].extent.height =
2674          MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
2675    }
2676 
2677    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
2678 }
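
/* The "round outwards" behavior above comes from min using truncating integer
 * division (round down) while max uses DIV_ROUND_UP (round up), so a scissor
 * that isn't aligned to the fragment area grows to the next fragment-area
 * boundary before being intersected with the scaled bin.
 */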
2679 
2680 static void
2681 tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2682                      const struct vk_viewport_state *vp)
2683 {
2684    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2685    struct apply_viewport_state state = {
2686       .vp = *vp,
2687       .share_scale = !cmd->state.per_view_viewport,
2688    };
2689    if (!state.share_scale)
2690       state.vp.scissor_count = num_views;
2691    unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
2692    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2693    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state);
2694 }
2695 
2696 static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
2697    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
2698    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
2699 };
2700 
2701 template <chip CHIP>
2702 static unsigned
2703 tu6_sample_locations_size(struct tu_device *dev, bool enable,
2704                           const struct vk_sample_locations_state *samp_loc)
2705 {
2706    return 6 + (enable ? 6 : 0);
2707 }
2708 
2709 template <chip CHIP>
2710 void
2711 tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
2712                           const struct vk_sample_locations_state *samp_loc)
2713 {
2714    uint32_t sample_config =
2715       COND(enable, A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE);
2716 
2717    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2718    tu_cs_emit(cs, sample_config);
2719 
2720    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2721    tu_cs_emit(cs, sample_config);
2722 
2723    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2724    tu_cs_emit(cs, sample_config);
2725 
2726    if (!enable)
2727       return;
2728 
2729    assert(samp_loc->grid_size.width == 1);
2730    assert(samp_loc->grid_size.height == 1);
2731 
2732    uint32_t sample_locations = 0;
2733    for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
2734       /* From VkSampleLocationEXT:
2735        *
2736        *    The values specified in a VkSampleLocationEXT structure are always
2737        *    clamped to the implementation-dependent sample location coordinate
2738        *    range
2739        *    [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
2740        */
2741       float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
2742                       SAMPLE_LOCATION_MAX);
2743       float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
2744                       SAMPLE_LOCATION_MAX);
2745 
2746       sample_locations |=
2747          (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
2748           A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y)) << i*8;
2749    }
2750 
2751    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_LOCATION_0, 1);
2752    tu_cs_emit(cs, sample_locations);
2753 
2754    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_LOCATION_0, 1);
2755    tu_cs_emit(cs, sample_locations);
2756 
2757    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_LOCATION_0, 1);
2758    tu_cs_emit(cs, sample_locations);
2759 }
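
/* Each sample's clamped X/Y pair is packed into one byte of the 32-bit
 * sample_locations word (the i*8 shift above), so at most four per-pixel
 * sample positions fit, and the same value is mirrored into the GRAS, RB and
 * SP_TP copies of the SAMPLE_LOCATION_0 register.
 */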
2760 
2761 static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
2762    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
2763 };
2764 
2765 template <chip CHIP>
2766 static unsigned
2767 tu6_depth_bias_size(struct tu_device *dev,
2768                     const struct vk_rasterization_state *rs)
2769 {
2770    return 4;
2771 }
2772 
2773 template <chip CHIP>
2774 void
2775 tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
2776 {
2777    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2778    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(rs->depth_bias.slope).value);
2779    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(rs->depth_bias.constant).value);
2780    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(rs->depth_bias.clamp).value);
2781 }
2782 
2783 static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
2784    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2785    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2786    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2787    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2788    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2789    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2790 };
2791 
2792 static void
2793 tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
2794                   const struct vk_color_blend_state *cb,
2795                   const struct vk_render_pass_state *rp)
2796 {
2797    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2798 
2799    uint32_t total_bpp = 0;
2800    for (unsigned i = 0; i < cb->attachment_count; i++) {
2801       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2802       if (!(cb->color_write_enables & (1u << i)))
2803          continue;
2804 
2805       const VkFormat format = rp->color_attachment_formats[i];
2806 
2807       uint32_t write_bpp = 0;
2808       if (format == VK_FORMAT_UNDEFINED) {
2809          /* do nothing */
2810       } else if (att->write_mask == 0xf) {
2811          write_bpp = vk_format_get_blocksizebits(format);
2812       } else {
2813          const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2814          for (uint32_t i = 0; i < 4; i++) {
2815             if (att->write_mask & (1 << i)) {
2816                write_bpp += util_format_get_component_bits(pipe_format,
2817                      UTIL_FORMAT_COLORSPACE_RGB, i);
2818             }
2819          }
2820       }
2821       total_bpp += write_bpp;
2822 
2823       if (rop_reads_dst || att->blend_enable) {
2824          total_bpp += write_bpp;
2825       }
2826    }
2827 
2828    bandwidth->color_bandwidth_per_sample = total_bpp / 8;
2829 
2830    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
2831       bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
2832             vk_format_to_pipe_format(rp->depth_attachment_format),
2833             UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
2834    }
2835 
2836    if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
2837       bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
2838             vk_format_to_pipe_format(rp->stencil_attachment_format),
2839             UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
2840    }
2841 }
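
/* Rough example of the bandwidth estimate above: a single
 * VK_FORMAT_R8G8B8A8_UNORM attachment with a full write mask contributes 32
 * bits per sample; if blending is enabled (or a dest-reading logic op is
 * used) it is counted twice, giving total_bpp = 64 and therefore
 * color_bandwidth_per_sample = 8 bytes.
 */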
2842 
2843 /* Return true if the blend state reads the color attachments. */
2844 static bool
2845 tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
2846                    const struct vk_render_pass_state *rp)
2847 {
2848    if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
2849       return true;
2850 
2851    for (unsigned i = 0; i < cb->attachment_count; i++) {
2852       if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
2853          continue;
2854 
2855       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2856       if (att->blend_enable)
2857          return true;
2858       if (!(cb->color_write_enables & (1u << i)))
2859          return true;
2860       unsigned mask =
2861          MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
2862       if ((att->write_mask & mask) != mask)
2863          return true;
2864    }
2865 
2866    return false;
2867 }
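
/* Disabled color writes and partial write masks are treated as reading the
 * destination above, presumably because a draw that does not fully overwrite
 * a pixel still depends on its previous contents and so has to be handled
 * like blending for LRZ purposes.
 */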
2868 
2869 static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
2870    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2871    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2872    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2873    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2874    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2875    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2876 };
2877 
2878 static void
2879 tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
2880                   const struct vk_color_blend_state *cb,
2881                   const struct vk_render_pass_state *rp)
2882 {
2883    lrz->reads_dest = tu6_calc_blend_lrz(cb, rp);
2884    lrz->valid = true;
2885 }
2886 
2887 static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
2888    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2889    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2890    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2891    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2892    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2893    MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
2894    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2895    MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
2896    MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
2897    MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
2898 };
2899 
2900 template <chip CHIP>
2901 static unsigned
2902 tu6_blend_size(struct tu_device *dev,
2903                const struct vk_color_blend_state *cb,
2904                bool alpha_to_coverage_enable,
2905                bool alpha_to_one_enable,
2906                uint32_t sample_mask)
2907 {
2908    unsigned num_rts = alpha_to_coverage_enable ?
2909       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2910    return 8 + 3 * num_rts;
2911 }
2912 
2913 template <chip CHIP>
2914 static void
2915 tu6_emit_blend(struct tu_cs *cs,
2916                const struct vk_color_blend_state *cb,
2917                bool alpha_to_coverage_enable,
2918                bool alpha_to_one_enable,
2919                uint32_t sample_mask)
2920 {
2921    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2922    enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
2923 
2924    uint32_t blend_enable_mask = 0;
2925    for (unsigned i = 0; i < cb->attachment_count; i++) {
2926       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2927       if (!(cb->color_write_enables & (1u << i)))
2928          continue;
2929 
2930       if (rop_reads_dst || att->blend_enable) {
2931          blend_enable_mask |= 1u << i;
2932       }
2933    }
2934 
2935    /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
2936     * enabled but there are no color attachments, in addition to changing
2937     * *_FS_OUTPUT_CNTL1.
2938     */
2939    unsigned num_rts = alpha_to_coverage_enable ?
2940       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2941 
2942    bool dual_src_blend = tu_blend_state_is_dual_src(cb);
2943 
2944    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = num_rts));
2945    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = num_rts));
2946    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2947                                           .unk8 = true,
2948                                           .dual_color_in_enable =
2949                                              dual_src_blend,
2950                                           .alpha_to_coverage =
2951                                              alpha_to_coverage_enable));
2952    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2953    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2954                                           .independent_blend = true,
2955                                           .dual_color_in_enable =
2956                                              dual_src_blend,
2957                                           .alpha_to_coverage =
2958                                              alpha_to_coverage_enable,
2959                                           .alpha_to_one = alpha_to_one_enable,
2960                                           .sample_mask = sample_mask));
2961 
2962    for (unsigned i = 0; i < num_rts; i++) {
2963       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2964       if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
2965          const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
2966          const enum adreno_rb_blend_factor src_color_factor =
2967             tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
2968          const enum adreno_rb_blend_factor dst_color_factor =
2969             tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
2970          const enum a3xx_rb_blend_opcode alpha_op =
2971             tu6_blend_op(att->alpha_blend_op);
2972          const enum adreno_rb_blend_factor src_alpha_factor =
2973             tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
2974          const enum adreno_rb_blend_factor dst_alpha_factor =
2975             tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
2976 
2977          tu_cs_emit_regs(cs,
2978                          A6XX_RB_MRT_CONTROL(i,
2979                                              .blend = att->blend_enable,
2980                                              .blend2 = att->blend_enable,
2981                                              .rop_enable = cb->logic_op_enable,
2982                                              .rop_code = rop,
2983                                              .component_enable = att->write_mask),
2984                          A6XX_RB_MRT_BLEND_CONTROL(i,
2985                                                    .rgb_src_factor = src_color_factor,
2986                                                    .rgb_blend_opcode = color_op,
2987                                                    .rgb_dest_factor = dst_color_factor,
2988                                                    .alpha_src_factor = src_alpha_factor,
2989                                                    .alpha_blend_opcode = alpha_op,
2990                                                    .alpha_dest_factor = dst_alpha_factor));
2991       } else {
2992          tu_cs_emit_regs(cs,
2993                          A6XX_RB_MRT_CONTROL(i,),
2994                          A6XX_RB_MRT_BLEND_CONTROL(i,));
2995       }
2996    }
2997 }
2998 
2999 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
3000    MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
3001 };
3002 
3003 template <chip CHIP>
3004 static unsigned
3005 tu6_blend_constants_size(struct tu_device *dev,
3006                          const struct vk_color_blend_state *cb)
3007 {
3008    return 5;
3009 }
3010 
3011 template <chip CHIP>
3012 static void
3013 tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
3014 {
3015    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3016    tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
3017 }
3018 
3019 static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
3020    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3021    MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
3022    MESA_VK_DYNAMIC_RS_POLYGON_MODE,
3023    MESA_VK_DYNAMIC_RS_CULL_MODE,
3024    MESA_VK_DYNAMIC_RS_FRONT_FACE,
3025    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
3026    MESA_VK_DYNAMIC_RS_LINE_MODE,
3027    MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
3028    MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
3029    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
3030    MESA_VK_DYNAMIC_RS_LINE_WIDTH,
3031 };
3032 
3033 template <chip CHIP>
3034 uint32_t
3035 tu6_rast_size(struct tu_device *dev,
3036               const struct vk_rasterization_state *rs,
3037               const struct vk_viewport_state *vp,
3038               bool multiview,
3039               bool per_view_viewport)
3040 {
3041    if (CHIP == A6XX) {
3042       return 15 + (dev->physical_device->info->a6xx.has_shading_rate ? 8 : 0);
3043    } else {
3044       return 17;
3045    }
3046 }
3047 
3048 template <chip CHIP>
3049 void
3050 tu6_emit_rast(struct tu_cs *cs,
3051               const struct vk_rasterization_state *rs,
3052               const struct vk_viewport_state *vp,
3053               bool multiview,
3054               bool per_view_viewport)
3055 {
3056    enum a5xx_line_mode line_mode =
3057       rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
3058       BRESENHAM : RECTANGULAR;
3059    tu_cs_emit_regs(cs,
3060                    A6XX_GRAS_SU_CNTL(
3061                      .cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
3062                      .cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
3063                      .front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
3064                      .linehalfwidth = rs->line.width / 2.0f,
3065                      .poly_offset = rs->depth_bias.enable,
3066                      .line_mode = line_mode,
3067                      .multiview_enable = multiview,
3068                      .rendertargetindexincr = multiview,
3069                      .viewportindexincr = multiview && per_view_viewport));
3070 
3071    bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
3072 
3073    tu_cs_emit_regs(cs,
3074                    A6XX_GRAS_CL_CNTL(
3075                      .znear_clip_disable = !depth_clip_enable,
3076                      .zfar_clip_disable = !depth_clip_enable,
3077                      /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3078                      .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3079                      .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
3080                      .vp_clip_code_ignore = 1));
3081 
3082    enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
3083 
3084    tu_cs_emit_regs(cs,
3085                    A6XX_VPC_POLYGON_MODE(polygon_mode));
3086 
3087    tu_cs_emit_regs(cs,
3088                    PC_POLYGON_MODE(CHIP, polygon_mode));
3089 
3090    if (CHIP == A7XX) {
3091       tu_cs_emit_regs(cs,
3092                      A7XX_VPC_POLYGON_MODE2(polygon_mode));
3093    }
3094 
3095    tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP,
3096       .stream = rs->rasterization_stream,
3097       .discard = rs->rasterizer_discard_enable));
3098    if (CHIP == A6XX) {
3099       tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107(
3100          .raster_discard = rs->rasterizer_discard_enable));
3101    } else {
3102       tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2(
3103          .stream = rs->rasterization_stream,
3104          .discard = rs->rasterizer_discard_enable));
3105    }
3106 
3107    /* move to hw ctx init? */
3108    tu_cs_emit_regs(cs,
3109                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3110                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
3111 
3112    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_shading_rate) {
3113       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00());
3114       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10());
3115       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20());
3116       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A30());
3117    }
3118 }
3119 
3120 static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
3121    MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
3122    MESA_VK_DYNAMIC_DS_STENCIL_OP,
3123    MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
3124    MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
3125    MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
3126    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
3127 };
3128 
3129 template <chip CHIP>
3130 static unsigned
3131 tu6_ds_size(struct tu_device *dev,
3132                  const struct vk_depth_stencil_state *ds,
3133                  const struct vk_render_pass_state *rp)
3134 {
3135    return 13;
3136 }
3137 
3138 template <chip CHIP>
3139 static void
3140 tu6_emit_ds(struct tu_cs *cs,
3141             const struct vk_depth_stencil_state *ds,
3142             const struct vk_render_pass_state *rp)
3143 {
3144    bool stencil_test_enable =
3145       ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3146    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3147       .stencil_enable = stencil_test_enable,
3148       .stencil_enable_bf = stencil_test_enable,
3149       .stencil_read = stencil_test_enable,
3150       .func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
3151       .fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
3152       .zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
3153       .zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
3154       .func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
3155       .fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
3156       .zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
3157       .zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
3158    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(stencil_test_enable));
3159 
3160    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(
3161       .mask = ds->stencil.front.compare_mask,
3162       .bfmask = ds->stencil.back.compare_mask));
3163 
3164    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(
3165       .wrmask = ds->stencil.front.write_mask,
3166       .bfwrmask = ds->stencil.back.write_mask));
3167 
3168    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(
3169       .ref = ds->stencil.front.reference,
3170       .bfref = ds->stencil.back.reference));
3171 
3172    tu_cs_emit_regs(cs,
3173                    A6XX_RB_Z_BOUNDS_MIN(ds->depth.bounds_test.min),
3174                    A6XX_RB_Z_BOUNDS_MAX(ds->depth.bounds_test.max));
3175 }
3176 
3177 static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
3178    MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
3179    MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
3180    MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
3181    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
3182    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3183 };
3184 
3185 template <chip CHIP>
3186 static unsigned
3187 tu6_rb_depth_cntl_size(struct tu_device *dev,
3188                        const struct vk_depth_stencil_state *ds,
3189                        const struct vk_render_pass_state *rp,
3190                        const struct vk_rasterization_state *rs)
3191 {
3192    return 4;
3193 }
3194 
3195 template <chip CHIP>
3196 static void
3197 tu6_emit_rb_depth_cntl(struct tu_cs *cs,
3198                        const struct vk_depth_stencil_state *ds,
3199                        const struct vk_render_pass_state *rp,
3200                        const struct vk_rasterization_state *rs)
3201 {
3202    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
3203       bool depth_test = ds->depth.test_enable;
3204       enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
3205 
3206       /* On some GPUs it is necessary to enable z test for depth bounds test
3207        * when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
3208        * required to pass z test. Relevant tests:
3209        *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
3210        *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
3211        */
3212       if (ds->depth.bounds_test.enable &&
3213           !ds->depth.test_enable &&
3214           cs->device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) {
3215          depth_test = true;
3216          zfunc = FUNC_ALWAYS;
3217       }
3218 
3219       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3220          .z_test_enable = depth_test,
3221          .z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
3222          .zfunc = zfunc,
3223          /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3224          .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3225          /* TODO don't set for ALWAYS/NEVER */
3226          .z_read_enable = ds->depth.test_enable || ds->depth.bounds_test.enable,
3227          .z_bounds_enable = ds->depth.bounds_test.enable));
3228       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(depth_test));
3229    } else {
3230       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
3231       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
3232    }
3233 }
3234 
3235 static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
3236    MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
3237 };
3238 
3239 template <chip CHIP>
3240 static unsigned
3241 tu6_prim_mode_sysmem_size(struct tu_device *dev,
3242                           bool raster_order_attachment_access,
3243                           VkImageAspectFlags feedback_loops,
3244                           bool *sysmem_single_prim_mode)
3245 {
3246    return 2;
3247 }
3248 
3249 template <chip CHIP>
3250 static void
3251 tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
3252                           bool raster_order_attachment_access,
3253                           VkImageAspectFlags feedback_loops,
3254                           bool *sysmem_single_prim_mode)
3255 {
3256    /* VK_EXT_rasterization_order_attachment_access:
3257     *
3258     * This extension allows access to framebuffer attachments when used as both
3259     * input and color attachments from one fragment to the next, in
3260     * rasterization order, without explicit synchronization.
3261     */
3262    raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
3263 
3264    /* If there is a feedback loop, then the shader can read the previous value
3265     * of a pixel being written out. It can also write some components and then
3266     * read different components without a barrier in between. This is a
3267     * problem in sysmem mode with UBWC, because the main buffer and flags
3268     * buffer can get out-of-sync if only one is flushed. We fix this by
3269     * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3270     * for advanced_blend in sysmem mode if a feedback loop is detected.
3271     */
3272    enum a6xx_single_prim_mode sysmem_prim_mode =
3273       (raster_order_attachment_access || feedback_loops) ?
3274       FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
3275 
3276    if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
3277       *sysmem_single_prim_mode = true;
3278 
3279    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
3280                                          .single_prim_mode = sysmem_prim_mode));
3281 }
3282 
3283 static inline bool
3284 emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
3285                     BITSET_WORD *pipeline_set,
3286                     const enum mesa_vk_dynamic_graphics_state *state_array,
3287                     unsigned num_states, bool extra_cond,
3288                     struct tu_pipeline_builder *builder)
3289 {
3290    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3291 
3292    /* Unrolling this loop should produce a constant value once the function is
3293     * inlined, because state_array and num_states are per-draw-state
3294     * constants, but GCC seems to need a little encouragement. clang does a
3295     * little better but still needs a pragma when there are a large number of
3296     * states.
3297     */
3298 #if defined(__clang__)
3299 #pragma clang loop unroll(full)
3300 #elif defined(__GNUC__) && __GNUC__ >= 8
3301 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3302 #endif
3303    for (unsigned i = 0; i < num_states; i++) {
3304       BITSET_SET(state, state_array[i]);
3305    }
3306 
3307    /* If all of the state is set, then after we emit it we can tentatively
3308     * remove it from the states to set for the pipeline by making it dynamic.
3309     * If we can't emit it, though, we need to keep around the partial state so
3310     * that we can emit it later, even if another draw state consumes it. That
3311     * is, we have to cancel any tentative removal.
3312     */
3313    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3314    memcpy(temp, pipeline_set, sizeof(temp));
3315    BITSET_AND(temp, temp, state);
3316    if (!BITSET_EQUAL(temp, state) || !extra_cond) {
3317       __bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
3318       return false;
3319    }
3320    __bitset_or(remove, remove, state, ARRAY_SIZE(state));
3321    return true;
3322 }
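
/* Example: for tu_depth_bias_state the `state` bitset contains only
 * MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS. If the pipeline provides that state
 * (and extra_cond holds), the group is emitted and recorded in `remove` so it
 * can later be made dynamic; if only part of a larger group is provided, the
 * provided bits go into `keep` instead and the group is not emitted here.
 */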
3323 
3324 template <chip CHIP>
3325 static void
3326 tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
3327                                struct tu_pipeline *pipeline)
3328 {
3329    struct tu_cs cs;
3330    BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3331    BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3332    BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3333 
3334    vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
3335 
3336 #define EMIT_STATE(name, extra_cond)                                          \
3337    emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state,         \
3338                        ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
3339 
3340 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3341    if (EMIT_STATE(name, extra_cond)) {                                        \
3342       unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__);  \
3343       if (size > 0) {                                                         \
3344          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);                    \
3345          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3346          pipeline->dynamic_state[id] =                                        \
3347             tu_cs_end_draw_state(&pipeline->cs, &cs);                         \
3348       }                                                                       \
3349       pipeline->set_state_mask |= (1u << id);                                 \
3350    }
3351 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
3352 
3353    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3354               builder->graphics_state.vi);
3355    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3356               builder->graphics_state.vi);
3357    /* If (a) per-view viewport is used or (b) we don't know yet, then we need
3358     * to set viewport and scissor state dynamically.
3359     */
3360    bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
3361       !pipeline->program.per_view_viewport;
3362    DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
3363                    builder->graphics_state.vp,
3364                    builder->graphics_state.rs);
3365    DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
3366               builder->graphics_state.vp);
3367    DRAW_STATE(sample_locations,
3368               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3369               builder->graphics_state.ms->sample_locations_enable,
3370               builder->graphics_state.ms->sample_locations);
3371    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3372               builder->graphics_state.rs);
3373    bool attachments_valid =
3374       builder->graphics_state.rp &&
3375       vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
3376    struct vk_color_blend_state dummy_cb = {};
3377    const struct vk_color_blend_state *cb = builder->graphics_state.cb;
3378    if (attachments_valid &&
3379        !(builder->graphics_state.rp->attachments &
3380          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3381       /* If there are no color attachments, then the original blend state may
3382        * be NULL and the common code sanitizes it to always be NULL. In this
3383        * case we want to emit an empty blend/bandwidth/etc.  rather than
3384        * letting it be dynamic (and potentially garbage).
3385        */
3386       cb = &dummy_cb;
3387       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3388       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3389       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3390       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3391       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3392       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3393       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3394       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3395    }
3396    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND, cb,
3397               builder->graphics_state.ms->alpha_to_coverage_enable,
3398               builder->graphics_state.ms->alpha_to_one_enable,
3399               builder->graphics_state.ms->sample_mask);
3400    if (EMIT_STATE(blend_lrz, attachments_valid))
3401       tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
3402                         builder->graphics_state.rp);
3403    if (EMIT_STATE(bandwidth, attachments_valid))
3404       tu_calc_bandwidth(&pipeline->bandwidth, cb,
3405                         builder->graphics_state.rp);
3406    DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
3407    if (attachments_valid &&
3408        !(builder->graphics_state.rp->attachments &
3409          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3410       /* Don't actually make anything dynamic as that may mean a partially-set
3411        * state group where the group is NULL, which angers common code.
3412        */
3413       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3414       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3415       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3416       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3417       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3418       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3419       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3420       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3421    }
3422    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3423                    pipeline_contains_all_shader_state(pipeline),
3424                    builder->graphics_state.rs,
3425                    builder->graphics_state.vp,
3426                    builder->graphics_state.rp->view_mask != 0,
3427                    pipeline->program.per_view_viewport);
3428    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3429               attachments_valid,
3430               builder->graphics_state.ds,
3431               builder->graphics_state.rp);
3432    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3433                    attachments_valid,
3434                    builder->graphics_state.ds,
3435                    builder->graphics_state.rp,
3436                    builder->graphics_state.rs);
3437    DRAW_STATE_COND(patch_control_points,
3438                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3439                    pipeline_contains_all_shader_state(pipeline),
3440                    pipeline->shaders[MESA_SHADER_VERTEX],
3441                    pipeline->shaders[MESA_SHADER_TESS_CTRL],
3442                    pipeline->shaders[MESA_SHADER_TESS_EVAL],
3443                    &pipeline->program,
3444                    builder->graphics_state.ts->patch_control_points);
3445    bool has_raster_order_state = false;
3446    if (pipeline->type == TU_PIPELINE_GRAPHICS) {
3447       has_raster_order_state = true;
3448    } else {
3449       struct tu_graphics_lib_pipeline *lib =
3450          tu_pipeline_to_graphics_lib(pipeline);
3451       has_raster_order_state =
3452          (lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
3453          (lib->state &
3454           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
3455    }
3456    if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3457       DRAW_STATE_COND(prim_mode_sysmem,
3458                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3459                       has_raster_order_state,
3460                       pipeline->output.raster_order_attachment_access ||
3461                       pipeline->ds.raster_order_attachment_access,
3462                       vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
3463                       &pipeline->prim_order.sysmem_single_prim_mode);
3464    }
3465 #undef DRAW_STATE
3466 #undef DRAW_STATE_COND
3467 #undef EMIT_STATE
3468 
3469    /* LRZ always needs depth/stencil state at draw time */
3470    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
3471    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
3472    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
3473    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
3474    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
3475    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3476    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
3477    BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
3478 
3479    /* MSAA needs line mode */
3480    BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
3481 
3482    /* The patch control point count is part of the draw */
3483    BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
3484 
3485    /* Vertex buffer state needs to know the max valid binding */
3486    BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
3487 
3488    /* Remove state which has been emitted and which we no longer need to set
3489     * when binding the pipeline, by making it "dynamic".
3490     */
3491    BITSET_ANDNOT(remove, remove, keep);
3492 
3493    BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
3494 
3495    BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
3496              remove);
3497 }
3498 
3499 static inline bool
3500 emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
3501                 const enum mesa_vk_dynamic_graphics_state *state_array,
3502                 unsigned num_states)
3503 {
3504    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3505 
3506    /* Unrolling this loop should produce a constant value once the function is
3507     * inlined, because state_array and num_states are per-draw-state
3508     * constants, but GCC seems to need a little encouragement. clang does a
3509     * little better but still needs a pragma when there are a large number of
3510     * states.
3511     */
3512 #if defined(__clang__)
3513 #pragma clang loop unroll(full)
3514 #elif defined(__GNUC__) && __GNUC__ >= 8
3515 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3516 #endif
3517    for (unsigned i = 0; i < num_states; i++) {
3518       BITSET_SET(state, state_array[i]);
3519    }
3520 
3521    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3522    BITSET_AND(temp, state, dynamic_state->dirty);
3523    return !BITSET_IS_EMPTY(temp);
3524 }
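
/* Returns whether any of the states in state_array is dirty in the command
 * buffer's dynamic state; EMIT_STATE() below uses this to decide whether a
 * draw-state group has to be re-emitted before the next draw.
 */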
3525 
3526 template <chip CHIP>
3527 uint32_t
3528 tu_emit_draw_state(struct tu_cmd_buffer *cmd)
3529 {
3530    struct tu_cs cs;
3531    uint32_t dirty_draw_states = 0;
3532 
3533 #define EMIT_STATE(name)                                                      \
3534    emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state,        \
3535                    ARRAY_SIZE(tu_##name##_state))
3536 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3537    if ((EMIT_STATE(name) || (extra_cond)) &&                                  \
3538        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3539       unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);      \
3540       if (size > 0) {                                                         \
3541          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                     \
3542          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3543          cmd->state.dynamic_state[id] =                                       \
3544             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3545       } else {                                                                \
3546          cmd->state.dynamic_state[id] = {};                                   \
3547       }                                                                       \
3548       dirty_draw_states |= (1u << id);                                        \
3549    }
3550 #define DRAW_STATE_FDM(name, id, ...)                                         \
3551    if ((EMIT_STATE(name) || (cmd->state.dirty & TU_CMD_DIRTY_FDM)) &&         \
3552        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3553       if (cmd->state.shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm) {             \
3554          tu_cs_set_writeable(&cmd->sub_cs, true);                             \
3555          tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__);                        \
3556          cmd->state.dynamic_state[id] =                                       \
3557             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3558          tu_cs_set_writeable(&cmd->sub_cs, false);                            \
3559       } else {                                                                \
3560          unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);   \
3561          if (size > 0) {                                                      \
3562             tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                  \
3563             tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                          \
3564             cmd->state.dynamic_state[id] =                                    \
3565                tu_cs_end_draw_state(&cmd->sub_cs, &cs);                       \
3566          } else {                                                             \
3567             cmd->state.dynamic_state[id] = {};                                \
3568          }                                                                    \
3575       }                                                                       \
3576       dirty_draw_states |= (1u << id);                                        \
3577    }
3578 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
3579 
3580    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3581               cmd->vk.dynamic_graphics_state.vi);
3582 
3583    /* Vertex input stride is special because it's part of the vertex input in
3584     * the pipeline but a separate array when it's dynamic state, so we have to
3585     * use two separate functions.
3586     */
3587 #define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
3588 #define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
3589 
3590    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3591               cmd->vk.dynamic_graphics_state.vi_binding_strides,
3592               cmd->vk.dynamic_graphics_state.vi_bindings_valid);
3593 
3594 #undef tu6_emit_vertex_stride
3595 #undef tu6_vertex_stride_size
3596 
3597    DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
3598                   &cmd->vk.dynamic_graphics_state.vp,
3599                   &cmd->vk.dynamic_graphics_state.rs);
3600    DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
3601                   &cmd->vk.dynamic_graphics_state.vp);
3602    DRAW_STATE(sample_locations,
3603               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3604               cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
3605               cmd->vk.dynamic_graphics_state.ms.sample_locations);
3606    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3607               &cmd->vk.dynamic_graphics_state.rs);
3608    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND,
3609               &cmd->vk.dynamic_graphics_state.cb,
3610               cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
3611               cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
3612               cmd->vk.dynamic_graphics_state.ms.sample_mask);
3613    if (EMIT_STATE(blend_lrz) ||
3614        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3615         !cmd->state.pipeline_blend_lrz)) {
3616       bool blend_reads_dest = tu6_calc_blend_lrz(&cmd->vk.dynamic_graphics_state.cb,
3617                                                  &cmd->state.vk_rp);
3618       if (blend_reads_dest != cmd->state.blend_reads_dest) {
3619          cmd->state.blend_reads_dest = blend_reads_dest;
3620          cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3621       }
3622    }
3623    if (EMIT_STATE(bandwidth) ||
3624        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3625         !cmd->state.pipeline_bandwidth))
3626       tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
3627                         &cmd->state.vk_rp);
3628    DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3629               &cmd->vk.dynamic_graphics_state.cb);
3630    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3631                    cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
3632                                        TU_CMD_DIRTY_PER_VIEW_VIEWPORT),
3633                    &cmd->vk.dynamic_graphics_state.rs,
3634                    &cmd->vk.dynamic_graphics_state.vp,
3635                    cmd->state.vk_rp.view_mask != 0,
3636                    cmd->state.per_view_viewport);
3637    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3638               cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3639               &cmd->vk.dynamic_graphics_state.ds,
3640               &cmd->state.vk_rp);
3641    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3642                    cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3643                    &cmd->vk.dynamic_graphics_state.ds,
3644                    &cmd->state.vk_rp,
3645                    &cmd->vk.dynamic_graphics_state.rs);
3646    DRAW_STATE_COND(patch_control_points,
3647                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3648                    cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
3649                    cmd->state.shaders[MESA_SHADER_VERTEX],
3650                    cmd->state.shaders[MESA_SHADER_TESS_CTRL],
3651                    cmd->state.shaders[MESA_SHADER_TESS_EVAL],
3652                    &cmd->state.program,
3653                    cmd->vk.dynamic_graphics_state.ts.patch_control_points);
3654    if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3655       DRAW_STATE_COND(prim_mode_sysmem,
3656                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3657                       cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
3658                                           TU_CMD_DIRTY_FEEDBACK_LOOPS),
3659                       cmd->state.raster_order_attachment_access,
3660                       cmd->vk.dynamic_graphics_state.feedback_loops |
3661                       cmd->state.pipeline_feedback_loops,
3662                       &cmd->state.rp.sysmem_single_prim_mode);
3663    }
3664 #undef DRAW_STATE
3665 #undef DRAW_STATE_COND
3666 #undef EMIT_STATE
3667 
3668    return dirty_draw_states;
3669 }
3670 TU_GENX(tu_emit_draw_state);
3671 
3672 static void
3673 tu_pipeline_builder_parse_depth_stencil(
3674    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3675 {
3676    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3677       builder->create_info->pDepthStencilState;
3678 
3679    if ((builder->graphics_state.rp->attachments ==
3680         MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
3681        (builder->graphics_state.rp->attachments &
3682         MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
3683       pipeline->ds.raster_order_attachment_access =
3684          ds_info && (ds_info->flags &
3685          (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
3686           VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
3687    }
3688 }
3689 
3690 static void
3691 tu_pipeline_builder_parse_multisample_and_color_blend(
3692    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3693 {
3694    /* The spec says:
3695     *
3696     *    pMultisampleState is a pointer to an instance of the
3697     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3698     *    has rasterization disabled.
3699     *
3700     * Also,
3701     *
3702     *    pColorBlendState is a pointer to an instance of the
3703     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3704     *    pipeline has rasterization disabled or if the subpass of the render
3705     *    pass the pipeline is created against does not use any color
3706     *    attachments.
3707     *
3708     * We leave the relevant registers stale when rasterization is disabled.
3709     */
3710    if (builder->rasterizer_discard) {
3711       return;
3712    }
3713 
3714    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
3715 
3716    const VkPipelineColorBlendStateCreateInfo *blend_info =
3717       (builder->graphics_state.rp->attachments &
3718        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
3719       ? builder->create_info->pColorBlendState
3720       : &dummy_blend_info;
3721 
3722    if (builder->graphics_state.rp->attachments &
3723        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
3724       pipeline->output.raster_order_attachment_access =
3725          blend_info && (blend_info->flags &
3726             VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
3727    }
3728 }
3729 
3730 static void
3731 tu_pipeline_builder_parse_rasterization_order(
3732    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3733 {
3734    if (builder->rasterizer_discard)
3735       return;
3736 
3737    bool raster_order_attachment_access =
3738       pipeline->output.raster_order_attachment_access ||
3739       pipeline->ds.raster_order_attachment_access ||
3740       TU_DEBUG(RAST_ORDER);
3741 
3742    /* VK_EXT_blend_operation_advanced would also require ordered access
3743     * when implemented in the future.
3744     */
3745 
3746    enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
3747 
3748    if (raster_order_attachment_access) {
3749       /* VK_EXT_rasterization_order_attachment_access:
3750        *
3751        * This extension allows access to framebuffer attachments when used as
3752        * both input and color attachments from one fragment to the next,
3753        * in rasterization order, without explicit synchronization.
3754        */
3755       gmem_prim_mode = FLUSH_PER_OVERLAP;
3756    }
3757 
3758    struct tu_cs cs;
3759 
3760    pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3761    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3762                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3763                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3764 }
3765 
3766 static void
3767 tu_pipeline_finish(struct tu_pipeline *pipeline,
3768                    struct tu_device *dev,
3769                    const VkAllocationCallbacks *alloc)
3770 {
3771    tu_cs_finish(&pipeline->cs);
3772    TU_RMV(resource_destroy, dev, &pipeline->bo);
3773 
3774    mtx_lock(&dev->pipeline_mutex);
3775    tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3776    mtx_unlock(&dev->pipeline_mutex);
3777 
3778    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
3779       struct tu_graphics_lib_pipeline *library =
3780          tu_pipeline_to_graphics_lib(pipeline);
3781 
3782       if (library->nir_shaders)
3783          vk_pipeline_cache_object_unref(&dev->vk,
3784                                         &library->nir_shaders->base);
3785 
3786       for (unsigned i = 0; i < library->num_sets; i++) {
3787          if (library->layouts[i])
3788             vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
3789       }
3790 
3791       vk_free2(&dev->vk.alloc, alloc, library->state_data);
3792    }
3793 
3794    for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
3795       if (pipeline->shaders[i])
3796          vk_pipeline_cache_object_unref(&dev->vk,
3797                                         &pipeline->shaders[i]->base);
3798    }
3799 
3800    ralloc_free(pipeline->executables_mem_ctx);
3801 }
3802 
3803 static VkGraphicsPipelineLibraryFlagBitsEXT
3804 vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
3805 {
3806    assert(util_bitcount(stage) == 1);
3807    switch (stage) {
3808    case VK_SHADER_STAGE_VERTEX_BIT:
3809    case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
3810    case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
3811    case VK_SHADER_STAGE_GEOMETRY_BIT:
3812    case VK_SHADER_STAGE_TASK_BIT_EXT:
3813    case VK_SHADER_STAGE_MESH_BIT_EXT:
3814       return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
3815    case VK_SHADER_STAGE_FRAGMENT_BIT:
3816       return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
3817    default:
3818       unreachable("Invalid shader stage");
3819    }
3820 }
3821 
3822 template <chip CHIP>
3823 static VkResult
3824 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3825                           struct tu_pipeline **pipeline)
3826 {
3827    VkResult result;
3828 
3829    if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
3830       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
3831          &builder->device->vk, builder->alloc,
3832          sizeof(struct tu_graphics_lib_pipeline),
3833          VK_OBJECT_TYPE_PIPELINE);
3834       if (!*pipeline)
3835          return VK_ERROR_OUT_OF_HOST_MEMORY;
3836       (*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
3837    } else {
3838       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
3839          &builder->device->vk, builder->alloc,
3840          sizeof(struct tu_graphics_pipeline),
3841          VK_OBJECT_TYPE_PIPELINE);
3842       if (!*pipeline)
3843          return VK_ERROR_OUT_OF_HOST_MEMORY;
3844       (*pipeline)->type = TU_PIPELINE_GRAPHICS;
3845    }
3846 
3847    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3848    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3849 
3850    tu_pipeline_builder_parse_libraries(builder, *pipeline);
3851 
3852    VkShaderStageFlags stages = 0;
3853    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
3854       VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
3855 
3856       /* Ignore shader stages that don't need to be imported. */
3857       if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
3858          continue;
3859 
3860       stages |= stage;
3861    }
3862    builder->active_stages = stages;
3863 
3864    (*pipeline)->active_stages = stages;
3865    for (unsigned i = 0; i < builder->num_libraries; i++)
3866       (*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
3867 
3868    /* Compile and upload shaders unless a library has already done that. */
3869    if ((*pipeline)->program.vs_state.size == 0) {
3870       tu_pipeline_builder_parse_layout(builder, *pipeline);
3871 
3872       result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3873       if (result != VK_SUCCESS) {
3874          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3875          return result;
3876       }
3877    }
3878 
3879    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
3880                                     &builder->layout, builder, NULL);
3881 
3882 
3883    if (set_combined_state(builder, *pipeline,
3884                           VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
3885                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
3886       if (result != VK_SUCCESS) {
3887          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3888          return result;
3889       }
3890 
3891       tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
3892                                   (*pipeline)->shaders);
3893 
3894       if (CHIP == A6XX) {
3895          /* The blob doesn't preload state on A7XX; preloading likely either
3896           * doesn't work there or doesn't provide a benefit.
3897           */
3898          tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
3899       }
3900    }
3901 
3902    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
3903       tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3904    }
3905 
3906    if (builder->state &
3907        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
3908       tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3909    }
3910 
3911    if (set_combined_state(builder, *pipeline,
3912                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
3913                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
3914       tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
3915    }
3916 
3917    tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
3918 
3919    if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
3920       struct tu_graphics_lib_pipeline *library =
3921          tu_pipeline_to_graphics_lib(*pipeline);
3922       result = vk_graphics_pipeline_state_copy(&builder->device->vk,
3923                                                &library->graphics_state,
3924                                                &builder->graphics_state,
3925                                                builder->alloc,
3926                                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
3927                                                &library->state_data);
3928       if (result != VK_SUCCESS) {
3929          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
3930          return result;
3931       }
3932    } else {
3933       struct tu_graphics_pipeline *gfx_pipeline =
3934          tu_pipeline_to_graphics(*pipeline);
3935       gfx_pipeline->dynamic_state.ms.sample_locations =
3936          &gfx_pipeline->sample_locations;
3937       vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
3938                                      &builder->graphics_state);
3939       gfx_pipeline->feedback_loops =
3940          vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
3941       gfx_pipeline->feedback_loop_may_involve_textures =
3942          builder->graphics_state.feedback_loop_not_input_only;
3943    }
3944 
3945    return VK_SUCCESS;
3946 }
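/* Sketch of the fast-link path that exercises the library handling above:
 * an application links previously created pipeline libraries into a complete
 * pipeline through VkPipelineLibraryCreateInfoKHR. Illustrative only, not
 * part of this driver; the libs[] handles and layout are hypothetical
 * application objects (the layout must be compatible with the layouts the
 * libraries were created with).
 */
#if 0
static VkResult
example_link_graphics_libraries(VkDevice device, VkPipelineCache cache,
                                const VkPipeline libs[4],
                                VkPipelineLayout layout, VkPipeline *out)
{
   const VkPipelineLibraryCreateInfoKHR link_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = 4,
      .pLibraries = libs,
   };
   const VkGraphicsPipelineCreateInfo create_info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = &link_info,
      /* Optionally add VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT. */
      .flags = 0,
      .layout = layout,
   };
   return vkCreateGraphicsPipelines(device, cache, 1, &create_info, NULL, out);
}
#endif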
3947 
3948 static void
3949 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3950 {
3951    ralloc_free(builder->mem_ctx);
3952 }
3953 
3954 void
3955 tu_fill_render_pass_state(struct vk_render_pass_state *rp,
3956                           const struct tu_render_pass *pass,
3957                           const struct tu_subpass *subpass)
3958 {
3959    rp->view_mask = subpass->multiview_mask;
3960    rp->color_attachment_count = subpass->color_count;
3961 
3962    const uint32_t a = subpass->depth_stencil_attachment.attachment;
3963    rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
3964    rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
3965    rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
3966    if (a != VK_ATTACHMENT_UNUSED) {
3967       VkFormat ds_format = pass->attachments[a].format;
3968       if (vk_format_has_depth(ds_format) && subpass->depth_used) {
3969          rp->depth_attachment_format = ds_format;
3970          rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
3971       }
3972       if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
3973          rp->stencil_attachment_format = ds_format;
3974          rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3975       }
3976    }
3977 
3978    for (uint32_t i = 0; i < subpass->color_count; i++) {
3979       const uint32_t a = subpass->color_attachments[i].attachment;
3980       if (a == VK_ATTACHMENT_UNUSED) {
3981          rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
3982          continue;
3983       }
3984 
3985       rp->color_attachment_formats[i] = pass->attachments[a].format;
3986       rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
3987    }
3988 }
3989 
3990 static void
3991 tu_pipeline_builder_init_graphics(
3992    struct tu_pipeline_builder *builder,
3993    struct tu_device *dev,
3994    struct vk_pipeline_cache *cache,
3995    const VkGraphicsPipelineCreateInfo *create_info,
3996    VkPipelineCreateFlags2KHR flags,
3997    const VkAllocationCallbacks *alloc)
3998 {
3999    *builder = (struct tu_pipeline_builder) {
4000       .device = dev,
4001       .mem_ctx = ralloc_context(NULL),
4002       .cache = cache,
4003       .alloc = alloc,
4004       .create_info = create_info,
4005       .create_flags = flags,
4006    };
4007 
4008    const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
4009       vk_find_struct_const(builder->create_info->pNext,
4010                            GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
4011 
4012    const VkPipelineLibraryCreateInfoKHR *library_info =
4013       vk_find_struct_const(builder->create_info->pNext,
4014                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
4015 
4016    if (gpl_info) {
4017       builder->state = gpl_info->flags;
4018    } else {
4019       /* Implement this bit of spec text:
4020        *
4021        *    If this structure is omitted, and either
4022        *    VkGraphicsPipelineCreateInfo::flags includes
4023        *    VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
4024        *    VkGraphicsPipelineCreateInfo::pNext chain includes a
4025        *    VkPipelineLibraryCreateInfoKHR structure with a libraryCount
4026        *    greater than 0, it is as if flags is 0. Otherwise if this
4027        *    structure is omitted, it is as if flags includes all possible
4028        *    subsets of the graphics pipeline (i.e. a complete graphics
4029        *    pipeline).
4030        */
4031       if ((library_info && library_info->libraryCount > 0) ||
4032           (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
4033          builder->state = 0;
4034       } else {
4035          builder->state =
4036             VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
4037             VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4038             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4039             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
4040       }
4041    }
4042 
4043    bool rasterizer_discard_dynamic = false;
4044    if (create_info->pDynamicState) {
4045       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
4046          if (create_info->pDynamicState->pDynamicStates[i] ==
4047                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
4048             rasterizer_discard_dynamic = true;
4049             break;
4050          }
4051       }
4052    }
4053 
4054    builder->rasterizer_discard =
4055       (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
4056       !rasterizer_discard_dynamic &&
4057       builder->create_info->pRasterizationState->rasterizerDiscardEnable;
4058 
4059    struct vk_render_pass_state rp_state = {};
4060    const struct vk_render_pass_state *driver_rp = NULL;
4061    VkPipelineCreateFlags2KHR rp_flags = 0;
4062 
4063    builder->unscaled_input_fragcoord = 0;
4064 
4065    /* Extract information we need from the turnip renderpass. This will be
4066     * filled out automatically if the app is using dynamic rendering or
4067     * renderpasses are emulated.
4068     */
4069    if (!TU_DEBUG(DYNAMIC) &&
4070        (builder->state &
4071         (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4072          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4073          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
4074        builder->create_info->renderPass) {
4075       const struct tu_render_pass *pass =
4076          tu_render_pass_from_handle(create_info->renderPass);
4077       const struct tu_subpass *subpass =
4078          &pass->subpasses[create_info->subpass];
4079 
4080       tu_fill_render_pass_state(&rp_state, pass, subpass);
4081 
4082       for (unsigned i = 0; i < subpass->input_count; i++) {
4083          /* Input attachments stored in GMEM must be loaded with unscaled
4084           * FragCoord.
4085           */
4086          if (subpass->input_attachments[i].patch_input_gmem)
4087             builder->unscaled_input_fragcoord |= 1u << i;
4088       }
4089 
4090       if (subpass->feedback_loop_color) {
4091          rp_flags |=
4092             VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4093       }
4094 
4095       if (subpass->feedback_loop_ds) {
4096          rp_flags |=
4097             VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4098       }
4099 
4100       if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
4101          rp_flags |=
4102             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
4103       }
4104 
4113 
4114       driver_rp = &rp_state;
4115    }
4116 
4117    vk_graphics_pipeline_state_fill(&dev->vk,
4118                                    &builder->graphics_state,
4119                                    builder->create_info,
4120                                    driver_rp,
4121                                    rp_flags,
4122                                    &builder->all_state,
4123                                    NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4124                                    NULL);
4125 
4126    if (builder->graphics_state.rp) {
4127       builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
4128          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
4129          TU_DEBUG(FDM);
4130    }
4131 }
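/* For reference, a hedged sketch of what produces the gpl_info path above:
 * creating one partial pipeline (here the fragment-shader subset) as a
 * library. Illustrative only, not part of this driver; fs_stage, layout,
 * and ds_state are hypothetical application objects.
 */
#if 0
static VkResult
example_create_fragment_shader_library(VkDevice device,
                                       const VkPipelineShaderStageCreateInfo *fs_stage,
                                       VkPipelineLayout layout,
                                       const VkPipelineDepthStencilStateCreateInfo *ds_state,
                                       VkPipeline *out)
{
   const VkGraphicsPipelineLibraryCreateInfoEXT library_info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT,
      .flags = VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT,
   };
   const VkGraphicsPipelineCreateInfo create_info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = &library_info,
      .flags = VK_PIPELINE_CREATE_LIBRARY_BIT_KHR |
               VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT,
      .stageCount = 1,
      .pStages = fs_stage,
      .pDepthStencilState = ds_state,
      .layout = layout,
   };
   return vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &create_info,
                                    NULL, out);
}
#endif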
4132 
4133 template <chip CHIP>
4134 static VkResult
4135 tu_graphics_pipeline_create(VkDevice device,
4136                             VkPipelineCache pipelineCache,
4137                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
4138                             VkPipelineCreateFlags2KHR flags,
4139                             const VkAllocationCallbacks *pAllocator,
4140                             VkPipeline *pPipeline)
4141 {
4142    VK_FROM_HANDLE(tu_device, dev, device);
4143    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4144 
4145    cache = cache ? cache : dev->mem_cache;
4146 
4147    struct tu_pipeline_builder builder;
4148    tu_pipeline_builder_init_graphics(&builder, dev, cache,
4149                                      pCreateInfo, flags, pAllocator);
4150 
4151    struct tu_pipeline *pipeline = NULL;
4152    VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
4153    tu_pipeline_builder_finish(&builder);
4154 
4155    if (result == VK_SUCCESS) {
4156       TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
4157 
4158       *pPipeline = tu_pipeline_to_handle(pipeline);
4159    } else
4160       *pPipeline = VK_NULL_HANDLE;
4161 
4162    return result;
4163 }
4164 
4165 template <chip CHIP>
4166 VKAPI_ATTR VkResult VKAPI_CALL
4167 tu_CreateGraphicsPipelines(VkDevice device,
4168                            VkPipelineCache pipelineCache,
4169                            uint32_t count,
4170                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
4171                            const VkAllocationCallbacks *pAllocator,
4172                            VkPipeline *pPipelines)
4173 {
4174    MESA_TRACE_FUNC();
4175    VkResult final_result = VK_SUCCESS;
4176    uint32_t i = 0;
4177 
4178    for (; i < count; i++) {
4179       VkPipelineCreateFlags2KHR flags =
4180          vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
4181 
4182       VkResult result =
4183          tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
4184                                            &pCreateInfos[i], flags,
4185                                            pAllocator, &pPipelines[i]);
4186 
4187       if (result != VK_SUCCESS) {
4188          final_result = result;
4189          pPipelines[i] = VK_NULL_HANDLE;
4190 
4191          if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4192             break;
4193       }
4194    }
4195 
4196    for (; i < count; i++)
4197       pPipelines[i] = VK_NULL_HANDLE;
4198 
4199    return final_result;
4200 }
4201 TU_GENX(tu_CreateGraphicsPipelines);
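/* A small caller-side sketch of the batch semantics implemented above:
 * failed entries are set to VK_NULL_HANDLE, an error is returned after all
 * entries are attempted (or immediately with EARLY_RETURN_ON_FAILURE), and
 * entries after an early return are also VK_NULL_HANDLE. Illustrative only;
 * create_infos is a hypothetical application array.
 */
#if 0
static void
example_create_batch(VkDevice device, uint32_t count,
                     const VkGraphicsPipelineCreateInfo *create_infos,
                     VkPipeline *pipelines)
{
   VkResult result = vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, count,
                                               create_infos, NULL, pipelines);
   if (result != VK_SUCCESS) {
      /* An all-or-nothing caller can simply destroy every slot:
       * vkDestroyPipeline ignores VK_NULL_HANDLE, and successfully created
       * entries remain valid until destroyed.
       */
      for (uint32_t i = 0; i < count; i++) {
         vkDestroyPipeline(device, pipelines[i], NULL);
         pipelines[i] = VK_NULL_HANDLE;
      }
   }
}
#endif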
4202 
4203 template <chip CHIP>
4204 static VkResult
4205 tu_compute_pipeline_create(VkDevice device,
4206                            VkPipelineCache pipelineCache,
4207                            const VkComputePipelineCreateInfo *pCreateInfo,
4208                            VkPipelineCreateFlags2KHR flags,
4209                            const VkAllocationCallbacks *pAllocator,
4210                            VkPipeline *pPipeline)
4211 {
4212    VK_FROM_HANDLE(tu_device, dev, device);
4213    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4214    VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4215    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4216    VkResult result;
4217    const struct ir3_shader_variant *v = NULL;
4218 
4219    cache = cache ? cache : dev->mem_cache;
4220 
4221    struct tu_compute_pipeline *pipeline;
4222 
4223    *pPipeline = VK_NULL_HANDLE;
4224 
4225    VkPipelineCreationFeedback pipeline_feedback = {
4226       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4227    };
4228 
4229    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4230       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4231 
4232    int64_t pipeline_start = os_time_get_nano();
4233 
4234    pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
4235       &dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
4236    if (!pipeline)
4237       return VK_ERROR_OUT_OF_HOST_MEMORY;
4238    pipeline->base.type = TU_PIPELINE_COMPUTE;
4239 
4240    pipeline->base.executables_mem_ctx = ralloc_context(NULL);
4241    util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
4242    pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4243 
4244    struct tu_shader_key key = { };
4245    bool allow_varying_subgroup_size =
4246       (stage_info->flags &
4247        VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
4248    bool require_full_subgroups =
4249       stage_info->flags &
4250       VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
4251    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
4252       vk_find_struct_const(stage_info,
4253                            PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
4254    tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
4255                                require_full_subgroups, subgroup_info,
4256                                dev);
4257 
4258    void *pipeline_mem_ctx = ralloc_context(NULL);
4259 
4260    unsigned char pipeline_sha1[20];
4261    tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key, dev->compiler);
4262 
4263    struct tu_shader *shader = NULL;
4264 
4265    const bool executable_info = flags &
4266       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4267 
4268    bool application_cache_hit = false;
4269 
4270    if (!executable_info) {
4271       shader =
4272          tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4273                                   &application_cache_hit);
4274    }
4275 
4276    if (application_cache_hit && cache != dev->mem_cache) {
4277       pipeline_feedback.flags |=
4278          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4279    }
4280 
4281    char *nir_initial_disasm = NULL;
4282 
4283    if (!shader) {
4284       if (flags &
4285           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
4286          result = VK_PIPELINE_COMPILE_REQUIRED;
4287          goto fail;
4288       }
4289 
4290       struct ir3_shader_key ir3_key = {};
4291 
4292       nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
4293                                         stage_info, MESA_SHADER_COMPUTE);
4294 
4295       nir_initial_disasm = executable_info ?
4296          nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
4297 
4298       result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
4299                                 pipeline_sha1, sizeof(pipeline_sha1), layout,
4300                                 executable_info);
4301       if (!shader) {
4302          goto fail;
4303       }
4304 
4305       shader = tu_pipeline_cache_insert(cache, shader);
4306    }
4307 
4308    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4309 
4310    if (creation_feedback) {
4311       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4312       assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4313       creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4314    }
4315 
4316    pipeline->base.active_desc_sets = shader->active_desc_sets;
4317 
4318    v = shader->variant;
4319 
4320    tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
4321                            &shader->const_state, v);
4322 
4323    result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
4324    if (result != VK_SUCCESS)
4325       goto fail;
4326 
4327    for (int i = 0; i < 3; i++)
4328       pipeline->local_size[i] = v->local_size[i];
4329 
4330    if (CHIP == A6XX) {
4331       tu6_emit_load_state(dev, &pipeline->base, layout);
4332    }
4333 
4334    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
4335 
4336    pipeline->instrlen = v->instrlen;
4337 
4338    pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
4339 
4340    ralloc_free(pipeline_mem_ctx);
4341 
4342    TU_RMV(compute_pipeline_create, dev, pipeline);
4343 
4344    *pPipeline = tu_pipeline_to_handle(&pipeline->base);
4345 
4346    return VK_SUCCESS;
4347 
4348 fail:
4349    if (shader)
4350       vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
4351 
4352    ralloc_free(pipeline_mem_ctx);
4353 
4354    vk_object_free(&dev->vk, pAllocator, pipeline);
4355 
4356    return result;
4357 }
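/* App-side sketch of the creation-feedback plumbing consumed above: one
 * pipeline-level feedback plus exactly one stage feedback for the single
 * compute stage. Illustrative only; shader_stage and layout are hypothetical
 * application objects.
 */
#if 0
static VkResult
example_create_compute_with_feedback(VkDevice device, VkPipelineCache cache,
                                     VkPipelineShaderStageCreateInfo shader_stage,
                                     VkPipelineLayout layout, VkPipeline *out)
{
   VkPipelineCreationFeedback pipeline_feedback = {0};
   VkPipelineCreationFeedback stage_feedback = {0};
   const VkPipelineCreationFeedbackCreateInfo feedback_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
      .pPipelineCreationFeedback = &pipeline_feedback,
      .pipelineStageCreationFeedbackCount = 1,
      .pPipelineStageCreationFeedbacks = &stage_feedback,
   };
   const VkComputePipelineCreateInfo create_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .pNext = &feedback_info,
      .stage = shader_stage,
      .layout = layout,
   };
   VkResult result = vkCreateComputePipelines(device, cache, 1, &create_info,
                                              NULL, out);
   if (result == VK_SUCCESS &&
       (pipeline_feedback.flags & VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT)) {
      /* duration is in nanoseconds; the cache-hit bit mirrors what the
       * driver sets when the lookup hit the application-supplied cache. */
      bool cache_hit = pipeline_feedback.flags &
         VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
      (void)cache_hit;
   }
   return result;
}
#endif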
4358 
4359 template <chip CHIP>
4360 VKAPI_ATTR VkResult VKAPI_CALL
4361 tu_CreateComputePipelines(VkDevice device,
4362                           VkPipelineCache pipelineCache,
4363                           uint32_t count,
4364                           const VkComputePipelineCreateInfo *pCreateInfos,
4365                           const VkAllocationCallbacks *pAllocator,
4366                           VkPipeline *pPipelines)
4367 {
4368    MESA_TRACE_FUNC();
4369    VkResult final_result = VK_SUCCESS;
4370    uint32_t i = 0;
4371 
4372    for (; i < count; i++) {
4373       VkPipelineCreateFlags2KHR flags =
4374          vk_compute_pipeline_create_flags(&pCreateInfos[i]);
4375 
4376       VkResult result =
4377          tu_compute_pipeline_create<CHIP>(device, pipelineCache,
4378                                           &pCreateInfos[i], flags,
4379                                           pAllocator, &pPipelines[i]);
4380       if (result != VK_SUCCESS) {
4381          final_result = result;
4382          pPipelines[i] = VK_NULL_HANDLE;
4383 
4384          if (flags &
4385              VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4386             break;
4387       }
4388    }
4389 
4390    for (; i < count; i++)
4391       pPipelines[i] = VK_NULL_HANDLE;
4392 
4393    return final_result;
4394 }
4395 TU_GENX(tu_CreateComputePipelines);
4396 
4397 VKAPI_ATTR void VKAPI_CALL
4398 tu_DestroyPipeline(VkDevice _device,
4399                    VkPipeline _pipeline,
4400                    const VkAllocationCallbacks *pAllocator)
4401 {
4402    VK_FROM_HANDLE(tu_device, dev, _device);
4403    VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4404 
4405    if (!_pipeline)
4406       return;
4407 
4408    TU_RMV(resource_destroy, dev, pipeline);
4409 
4410    tu_pipeline_finish(pipeline, dev, pAllocator);
4411    vk_object_free(&dev->vk, pAllocator, pipeline);
4412 }
4413 
4414 #define WRITE_STR(field, ...) ({                                \
4415    memset(field, 0, sizeof(field));                             \
4416    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4417    assert(_i > 0 && _i < sizeof(field));                        \
4418 })
4419 
4420 static const struct tu_pipeline_executable *
4421 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4422 {
4423    assert(index < util_dynarray_num_elements(&pipeline->executables,
4424                                              struct tu_pipeline_executable));
4425    return util_dynarray_element(
4426       &pipeline->executables, struct tu_pipeline_executable, index);
4427 }
4428 
4429 VKAPI_ATTR VkResult VKAPI_CALL
4430 tu_GetPipelineExecutablePropertiesKHR(
4431       VkDevice _device,
4432       const VkPipelineInfoKHR* pPipelineInfo,
4433       uint32_t* pExecutableCount,
4434       VkPipelineExecutablePropertiesKHR* pProperties)
4435 {
4436    VK_FROM_HANDLE(tu_device, dev, _device);
4437    VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4438    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4439                           pProperties, pExecutableCount);
4440 
4441    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4442       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4443          gl_shader_stage stage = exe->stage;
4444          props->stages = mesa_to_vk_shader_stage(stage);
4445 
4446          if (!exe->is_binning)
4447             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4448          else
4449             WRITE_STR(props->name, "Binning VS");
4450 
4451          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4452 
4453          props->subgroupSize =
4454             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4455       }
4456    }
4457 
4458    return vk_outarray_status(&out);
4459 }
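/* Caller-side two-call pattern for the query above: first call learns the
 * count, second call fills the array. Illustrative only, not part of this
 * driver.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static void
example_list_executables(VkDevice device, VkPipeline pipeline)
{
   const VkPipelineInfoKHR pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR,
      .pipeline = pipeline,
   };
   uint32_t count = 0;
   vkGetPipelineExecutablePropertiesKHR(device, &pipeline_info, &count, NULL);

   VkPipelineExecutablePropertiesKHR *props = calloc(count, sizeof(*props));
   for (uint32_t i = 0; i < count; i++)
      props[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR;
   vkGetPipelineExecutablePropertiesKHR(device, &pipeline_info, &count, props);

   for (uint32_t i = 0; i < count; i++)
      printf("%u: %s (%s), subgroup size %u\n", i, props[i].name,
             props[i].description, props[i].subgroupSize);
   free(props);
}
#endif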
4460 
4461 VKAPI_ATTR VkResult VKAPI_CALL
4462 tu_GetPipelineExecutableStatisticsKHR(
4463       VkDevice _device,
4464       const VkPipelineExecutableInfoKHR* pExecutableInfo,
4465       uint32_t* pStatisticCount,
4466       VkPipelineExecutableStatisticKHR* pStatistics)
4467 {
4468    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4469    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4470                           pStatistics, pStatisticCount);
4471 
4472    const struct tu_pipeline_executable *exe =
4473       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4474 
4475    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4476       WRITE_STR(stat->name, "Max Waves Per Core");
4477       WRITE_STR(stat->description,
4478                 "Maximum number of simultaneous waves per core.");
4479       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4480       stat->value.u64 = exe->stats.max_waves;
4481    }
4482 
4483    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4484       WRITE_STR(stat->name, "Instruction Count");
4485       WRITE_STR(stat->description,
4486                 "Total number of IR3 instructions in the final generated "
4487                 "shader executable.");
4488       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4489       stat->value.u64 = exe->stats.instrs_count;
4490    }
4491 
4492    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4493       WRITE_STR(stat->name, "Code size");
4494       WRITE_STR(stat->description,
4495                 "Total number of dwords in the final generated "
4496                 "shader executable.");
4497       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4498       stat->value.u64 = exe->stats.sizedwords;
4499    }
4500 
4501    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4502       WRITE_STR(stat->name, "NOPs Count");
4503       WRITE_STR(stat->description,
4504                 "Number of NOP instructions in the final generated "
4505                 "shader executable.");
4506       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4507       stat->value.u64 = exe->stats.nops_count;
4508    }
4509 
4510    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4511       WRITE_STR(stat->name, "MOV Count");
4512       WRITE_STR(stat->description,
4513                 "Number of MOV instructions in the final generated "
4514                 "shader executable.");
4515       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4516       stat->value.u64 = exe->stats.mov_count;
4517    }
4518 
4519    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4520       WRITE_STR(stat->name, "COV Count");
4521       WRITE_STR(stat->description,
4522                 "Number of COV instructions in the final generated "
4523                 "shader executable.");
4524       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4525       stat->value.u64 = exe->stats.cov_count;
4526    }
4527 
4528    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4529       WRITE_STR(stat->name, "Registers used");
4530       WRITE_STR(stat->description,
4531                 "Number of registers used in the final generated "
4532                 "shader executable.");
4533       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4534       stat->value.u64 = exe->stats.max_reg + 1;
4535    }
4536 
4537    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4538       WRITE_STR(stat->name, "Half-registers used");
4539       WRITE_STR(stat->description,
4540                 "Number of half-registers used in the final generated "
4541                 "shader executable.");
4542       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4543       stat->value.u64 = exe->stats.max_half_reg + 1;
4544    }
4545 
4546    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4547       WRITE_STR(stat->name, "Last interpolation instruction");
4548       WRITE_STR(stat->description,
4549                 "The instruction where varying storage in Local Memory is released");
4550       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4551       stat->value.u64 = exe->stats.last_baryf;
4552    }
4553 
4554    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4555       WRITE_STR(stat->name, "Last helper instruction");
4556       WRITE_STR(stat->description,
4557                 "The instruction where helper invocations are killed");
4558       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4559       stat->value.u64 = exe->stats.last_helper;
4560    }
4561 
4562    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4563       WRITE_STR(stat->name, "Instructions with SS sync bit");
4564       WRITE_STR(stat->description,
4565                 "SS bit is set for instructions which depend on a result "
4566                 "of \"long\" instructions to prevent RAW hazard.");
4567       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4568       stat->value.u64 = exe->stats.ss;
4569    }
4570 
4571    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4572       WRITE_STR(stat->name, "Instructions with SY sync bit");
4573       WRITE_STR(stat->description,
4574                 "SY bit is set for instructions which depend on a result "
4575                 "of loads from global memory to prevent RAW hazard.");
4576       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4577       stat->value.u64 = exe->stats.sy;
4578    }
4579 
4580    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4581       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4582       WRITE_STR(stat->description,
4583                 "A better metric to estimate the impact of SS syncs.");
4584       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4585       stat->value.u64 = exe->stats.sstall;
4586    }
4587 
4588    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4589       WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4590       WRITE_STR(stat->description,
4591                 "A better metric to estimate the impact of SY syncs.");
4592       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4593       stat->value.u64 = exe->stats.systall;
4594    }
4595 
4596    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4597       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4598          WRITE_STR(stat->name, "cat%d instructions", i);
4599          WRITE_STR(stat->description,
4600                   "Number of cat%d instructions.", i);
4601          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4602          stat->value.u64 = exe->stats.instrs_per_cat[i];
4603       }
4604    }
4605 
4606    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4607       WRITE_STR(stat->name, "STP Count");
4608       WRITE_STR(stat->description,
4609                 "Number of STore Private instructions in the final generated "
4610                 "shader executable.");
4611       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4612       stat->value.u64 = exe->stats.stp_count;
4613    }
4614 
4615    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4616       WRITE_STR(stat->name, "LDP Count");
4617       WRITE_STR(stat->description,
4618                 "Number of LoaD Private instructions in the final generated "
4619                 "shader executable.");
4620       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4621       stat->value.u64 = exe->stats.ldp_count;
4622    }
4623 
4624    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4625       WRITE_STR(stat->name, "Early preamble");
4626       WRITE_STR(stat->description,
4627                 "Whether the preamble will be executed early.");
4628       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR;
4629       stat->value.b32 = exe->stats.early_preamble;
4630    }
4631 
4632    return vk_outarray_status(&out);
4633 }
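/* Caller-side sketch for the statistics above; the pipeline must have been
 * created with VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR for them to be
 * available. Illustrative only, not part of this driver.
 */
#if 0
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

static void
example_dump_statistics(VkDevice device, VkPipeline pipeline, uint32_t exe_index)
{
   const VkPipelineExecutableInfoKHR exe_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
      .pipeline = pipeline,
      .executableIndex = exe_index,
   };
   uint32_t count = 0;
   vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &count, NULL);

   VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
   for (uint32_t i = 0; i < count; i++)
      stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
   vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &count, stats);

   for (uint32_t i = 0; i < count; i++) {
      if (stats[i].format == VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR)
         printf("%s: %" PRIu64 "\n", stats[i].name, stats[i].value.u64);
      else if (stats[i].format == VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR)
         printf("%s: %s\n", stats[i].name, stats[i].value.b32 ? "true" : "false");
   }
   free(stats);
}
#endif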
4634 
4635 static bool
4636 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4637               const char *data)
4638 {
4639    ir->isText = VK_TRUE;
4640 
4641    size_t data_len = strlen(data) + 1;
4642 
4643    if (ir->pData == NULL) {
4644       ir->dataSize = data_len;
4645       return true;
4646    }
4647 
4648    strncpy((char *) ir->pData, data, ir->dataSize);
4649    if (ir->dataSize < data_len)
4650       return false;
4651 
4652    ir->dataSize = data_len;
4653    return true;
4654 }
4655 
4656 VKAPI_ATTR VkResult VKAPI_CALL
4657 tu_GetPipelineExecutableInternalRepresentationsKHR(
4658     VkDevice _device,
4659     const VkPipelineExecutableInfoKHR* pExecutableInfo,
4660     uint32_t* pInternalRepresentationCount,
4661     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4662 {
4663    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4664    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4665                           pInternalRepresentations, pInternalRepresentationCount);
4666    bool incomplete_text = false;
4667 
4668    const struct tu_pipeline_executable *exe =
4669       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4670 
4671    if (exe->nir_from_spirv) {
4672       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4673          WRITE_STR(ir->name, "NIR from SPIRV");
4674          WRITE_STR(ir->description,
4675                    "Initial NIR before any optimizations");
4676 
4677          if (!write_ir_text(ir, exe->nir_from_spirv))
4678             incomplete_text = true;
4679       }
4680    }
4681 
4682    if (exe->nir_final) {
4683       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4684          WRITE_STR(ir->name, "Final NIR");
4685          WRITE_STR(ir->description,
4686                    "Final NIR before going into the back-end compiler");
4687 
4688          if (!write_ir_text(ir, exe->nir_final))
4689             incomplete_text = true;
4690       }
4691    }
4692 
4693    if (exe->disasm) {
4694       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4695          WRITE_STR(ir->name, "IR3 Assembly");
4696          WRITE_STR(ir->description,
4697                    "Final IR3 assembly for the generated shader binary");
4698 
4699          if (!write_ir_text(ir, exe->disasm))
4700             incomplete_text = true;
4701       }
4702    }
4703 
4704    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4705 }
4706
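/* Caller-side sketch of the size-then-data protocol that write_ir_text()
 * implements: pass pData = NULL to learn dataSize, then call again with a
 * buffer, and treat VK_INCOMPLETE as truncation. Requires creating the
 * pipeline with VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR.
 * Illustrative only, not part of this driver.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static void
example_dump_ir(VkDevice device, VkPipeline pipeline, uint32_t exe_index)
{
   const VkPipelineExecutableInfoKHR exe_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
      .pipeline = pipeline,
      .executableIndex = exe_index,
   };
   uint32_t count = 0;
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info, &count,
                                                     NULL);

   VkPipelineExecutableInternalRepresentationKHR *irs = calloc(count, sizeof(*irs));
   for (uint32_t i = 0; i < count; i++)
      irs[i].sType =
         VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;

   /* First pass: pData is NULL, so the driver only fills dataSize. */
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info, &count,
                                                     irs);

   for (uint32_t i = 0; i < count; i++)
      irs[i].pData = malloc(irs[i].dataSize);

   /* Second pass: the driver copies up to dataSize bytes per entry and may
    * return VK_INCOMPLETE if any buffer was too small. */
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info, &count,
                                                     irs);

   for (uint32_t i = 0; i < count; i++) {
      if (irs[i].isText)
         printf("== %s ==\n%s\n", irs[i].name, (const char *)irs[i].pData);
      free(irs[i].pData);
   }
   free(irs);
}
#endif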