xref: /aosp_15_r20/external/mesa3d/src/intel/blorp/blorp_genX_exec_brw.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #ifndef BLORP_GENX_EXEC_BRW_H
25 #define BLORP_GENX_EXEC_BRW_H
26 
27 #include "blorp_priv.h"
28 #include "dev/intel_device_info.h"
29 #include "common/intel_compute_slm.h"
30 #include "common/intel_sample_positions.h"
31 #include "common/intel_l3_config.h"
32 #include "genxml/gen_macros.h"
33 #include "intel/compiler/brw_compiler.h"
34 
35 /**
36  * This file provides the blorp pipeline setup and execution functionality.
37  * It defines the following function:
38  *
39  * static void
40  * blorp_exec(struct blorp_context *blorp, void *batch_data,
41  *            const struct blorp_params *params);
42  *
43  * It is the job of whoever includes this header to wrap this in something
44  * to get an externally visible symbol.
45  *
46  * In order for the blorp_exec function to work, the driver must provide
47  * implementations of the following static helper functions.
48  */
49 
50 static void *
51 blorp_emit_dwords(struct blorp_batch *batch, unsigned n);
52 
53 static uint64_t
54 blorp_emit_reloc(struct blorp_batch *batch,
55                  void *location, struct blorp_address address, uint32_t delta);
56 
57 static void
58 blorp_measure_start(struct blorp_batch *batch,
59                     const struct blorp_params *params);
60 
61 static void
62 blorp_measure_end(struct blorp_batch *batch,
63                   const struct blorp_params *params);
64 
65 static void *
66 blorp_alloc_dynamic_state(struct blorp_batch *batch,
67                           uint32_t size,
68                           uint32_t alignment,
69                           uint32_t *offset);
70 
71 UNUSED static void *
72 blorp_alloc_general_state(struct blorp_batch *batch,
73                           uint32_t size,
74                           uint32_t alignment,
75                           uint32_t *offset);
76 
77 static uint32_t
78 blorp_get_dynamic_state(struct blorp_batch *batch,
79                         enum blorp_dynamic_state name);
80 
81 static void *
82 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
83                           struct blorp_address *addr);
84 static void
85 blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
86                                            const struct blorp_address *addrs,
87                                            uint32_t *sizes,
88                                            unsigned num_vbs);
89 
90 UNUSED static struct blorp_address
91 blorp_get_workaround_address(struct blorp_batch *batch);
92 
93 static bool
94 blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
95                           unsigned state_size, unsigned state_alignment,
96                           uint32_t *bt_offset, uint32_t *surface_offsets,
97                           void **surface_maps);
98 
99 static uint32_t
100 blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
101                                       uint32_t offset);
102 
103 static void
104 blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);
105 
106 static void
107 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
108                     struct blorp_address address, uint32_t delta);
109 
110 static uint64_t
111 blorp_get_surface_address(struct blorp_batch *batch,
112                           struct blorp_address address);
113 
114 #if GFX_VER < 10
115 static struct blorp_address
116 blorp_get_surface_base_address(struct blorp_batch *batch);
117 #endif
118 
119 static const struct intel_l3_config *
120 blorp_get_l3_config(struct blorp_batch *batch);
121 
122 static void
123 blorp_pre_emit_urb_config(struct blorp_batch *batch,
124                           struct intel_urb_config *urb_config);
125 
126 static void
127 blorp_emit_urb_config(struct blorp_batch *batch,
128                       struct intel_urb_config *urb_config);
129 
130 static void
131 blorp_emit_pipeline(struct blorp_batch *batch,
132                     const struct blorp_params *params);
133 
134 static void
135 blorp_emit_pre_draw(struct blorp_batch *batch,
136                     const struct blorp_params *params);
137 static void
138 blorp_emit_post_draw(struct blorp_batch *batch,
139                      const struct blorp_params *params);
140 
141 static inline unsigned
brw_blorp_get_urb_length(const struct brw_wm_prog_data * prog_data)142 brw_blorp_get_urb_length(const struct brw_wm_prog_data *prog_data)
143 {
144    if (prog_data == NULL)
145       return 1;
146 
147    /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
148     *
149     * read_length = ceiling((max_source_attr+1)/2)
150     */
151    return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
152 }
153 
154 /***** BEGIN blorp_exec implementation ******/
155 
156 static uint64_t
_blorp_combine_address(struct blorp_batch * batch,void * location,struct blorp_address address,uint32_t delta)157 _blorp_combine_address(struct blorp_batch *batch, void *location,
158                        struct blorp_address address, uint32_t delta)
159 {
160    if (address.buffer == NULL) {
161       return address.offset + delta;
162    } else {
163       return blorp_emit_reloc(batch, location, address, delta);
164    }
165 }
166 
/* Plug blorp's address type and relocation helper into the genxml pack
 * functions before genX_pack.h is included below.
 */
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address
170 
171 #include "genxml/genX_pack.h"
172 #include "common/intel_genX_state_brw.h"
173 
/* Token-pasting helpers: map a genxml struct name to the companion
 * symbols generated by genX_pack.h.
 */
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command into the batch.  Used as:
 *
 *    blorp_emit(batch, GENX(CMD), name) { name.Field = value; }
 *
 * The for-loop runs the attached statement block exactly once with
 * `name` in scope, then packs the struct into the batch on loop exit.
 * If blorp_emit_dwords() returns NULL the block is skipped entirely.
 */
#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

/* Emit a variable-length command of `n` dwords; additional designated
 * initializers go in __VA_ARGS__.  Evaluates to a pointer to the
 * payload that follows the packed fixed part, or NULL if the batch
 * allocation failed.
 */
#define blorp_emitn(batch, cmd, n, ...) ({                  \
      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
      if (_dw) {                                            \
         struct cmd template = {                            \
            _blorp_cmd_header(cmd),                         \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__                                     \
         };                                                 \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
      }                                                     \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
   })

/* Expression that yields a zero-initialized `struct S` (no genxml
 * header dwords, unlike blorp_emit()'s initializer).
 */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Pack a state struct on the stack and hand it to the context's
 * upload_dynamic_state() hook (used to pre-upload cached dynamic
 * states).  Same single-iteration for-loop trick as blorp_emit().
 */
#define blorp_context_upload_dynamic(context, state, name,              \
                                     align, dynamic_name)               \
   for (struct state name = STRUCT_ZERO(state), *_dst = &name;          \
        _dst != NULL;                                                   \
        ({                                                              \
           uint32_t _dw[_blorp_cmd_length(state)];                      \
           _blorp_cmd_pack(state)(NULL, (void *)_dw, &name);            \
           context->upload_dynamic_state(context, _dw,                  \
                                         _blorp_cmd_length(state) * 4,  \
                                         align, dynamic_name);          \
           _dst = NULL;                                                 \
        }))

/* Like blorp_emit(), but packs into freshly allocated dynamic state
 * (offset returned via *offset) and flushes the written range.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset)           \
   for (struct state name = STRUCT_ZERO(state),                         \
        *_dst = blorp_alloc_dynamic_state(batch,                   \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset);               \
        __builtin_expect(_dst != NULL, 1);                              \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
        _dst = NULL)
223 
224 /* 3DSTATE_URB
225  * 3DSTATE_URB_VS
226  * 3DSTATE_URB_HS
227  * 3DSTATE_URB_DS
228  * 3DSTATE_URB_GS
229  *
230  * Assign the entire URB to the VS. Even though the VS disabled, URB space
231  * is still needed because the clipper loads the VUE's from the URB. From
232  * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
233  * Dword 1.15:0 "VS Number of URB Entries":
234  *     This field is always used (even if VS Function Enable is DISABLED).
235  *
236  * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
237  * safely ignore it because this batch contains only one draw call.
238  *     Because of URB corruption caused by allocating a previous GS unit
239  *     URB entry to the VS unit, software is required to send a “GS NULL
240  *     Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
241  *     plus a dummy DRAW call before any case where VS will be taking over
242  *     GS URB space.
243  *
 * If the 3DSTATE_URB_VS is emitted, then the others must be also.
245  * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
246  *
247  *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
248  *     programmed in order for the programming of this state to be
249  *     valid.
250  */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once vertex fetcher has written full VUE entries with complete
    * header the space requirement is as follows per vertex (in bytes):
    *
    *     Header    Position    Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |      n x 16       |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for number of varying inputs expressed as vec4s.
    */
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);

   /* Give the VS the computed entry size; the other stages get the
    * minimum size of 1 (see the comment block above this function).
    */
   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 1, 1, 1 },
   };

   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->brw->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, &urb_cfg,
                        deref_block_size, &constrained);

   /* Tell drivers about the config. */
   blorp_pre_emit_urb_config(batch, &urb_cfg);

   /* Emit 3DSTATE_URB_{VS,HS,DS,GS}.  The four packets have identical
    * layouts except for the sub-opcode, so reuse the VS genxml struct
    * and bump _3DCommandSubOpcode by the stage index.
    */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg.start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries      = urb_cfg.entries[i];
      }
   }

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      /* Mesh/task stages get zeroed (empty) URB allocations. */
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
#endif
   }
}
303 
304 static void
305 blorp_emit_memcpy(struct blorp_batch *batch,
306                   struct blorp_address dst,
307                   struct blorp_address src,
308                   uint32_t size);
309 
310 static void
blorp_emit_vertex_data(struct blorp_batch * batch,const struct blorp_params * params,struct blorp_address * addr,uint32_t * size)311 blorp_emit_vertex_data(struct blorp_batch *batch,
312                        const struct blorp_params *params,
313                        struct blorp_address *addr,
314                        uint32_t *size)
315 {
316    const float vertices[] = {
317       /* v0 */ (float)params->x1, (float)params->y1, params->z,
318       /* v1 */ (float)params->x0, (float)params->y1, params->z,
319       /* v2 */ (float)params->x0, (float)params->y0, params->z,
320    };
321 
322    void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
323    if (data == NULL)
324       return;
325    memcpy(data, vertices, sizeof(vertices));
326    *size = sizeof(vertices);
327    blorp_flush_range(batch, data, *size);
328 }
329 
static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   /* Build the vertex buffer feeding the flat (constant) inputs: 16
    * bytes of VS inputs followed by one vec4 per varying the fragment
    * program actually reads.  *addr/*size describe the buffer.
    */
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;

   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   if (data == NULL)
      return;
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used by
       * the program and when necessary copy the values from the input storage
       * to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         /* Negative urb_setup means the slot is unused by the program. */
         const int input_index = wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE.  We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in the
       * loop above.  Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      /* NOTE(review): pre-Gfx10 this copies the surface-state-sized clear
       * value; Gfx10+ copies four 32-bit channels.
       */
      const unsigned clear_color_size =
         GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
   }
}
397 
398 static void
blorp_fill_vertex_buffer_state(struct GENX (VERTEX_BUFFER_STATE)* vb,unsigned idx,struct blorp_address addr,uint32_t size,uint32_t stride)399 blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
400                                unsigned idx,
401                                struct blorp_address addr, uint32_t size,
402                                uint32_t stride)
403 {
404    vb[idx].VertexBufferIndex = idx;
405    vb[idx].BufferStartingAddress = addr;
406    vb[idx].BufferPitch = stride;
407    vb[idx].MOCS = addr.mocs;
408    vb[idx].AddressModifyEnable = true;
409    vb[idx].BufferSize = size;
410 
411 #if GFX_VER >= 12
412    vb[idx].L3BypassDisable = true;
413 #endif
414 }
415 
416 static void
blorp_emit_vertex_buffers(struct blorp_batch * batch,const struct blorp_params * params)417 blorp_emit_vertex_buffers(struct blorp_batch *batch,
418                           const struct blorp_params *params)
419 {
420    struct GENX(VERTEX_BUFFER_STATE) vb[2] = {};
421    const uint32_t num_vbs = ARRAY_SIZE(vb);
422 
423    struct blorp_address addrs[2] = {};
424    uint32_t sizes[2] = {};
425    blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
426    if (sizes[0] == 0)
427       return;
428    blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
429                                   3 * sizeof(float));
430 
431    blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
432    blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);
433 
434    blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
435 
436    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
437    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
438    if (!dw)
439       return;
440 
441    for (unsigned i = 0; i < num_vbs; i++) {
442       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
443       dw += GENX(VERTEX_BUFFER_STATE_length);
444    }
445 }
446 
static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   /* Emit the vertex-fetch state: 3DSTATE_VERTEX_ELEMENTS plus the
    * VF_* packets that configure the RECTLIST draw.
    */
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   /* Two fixed elements (header + position) plus one per varying. */
   const unsigned num_elements = 2 + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Setup VBO for the rectangle primitive..
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. Below vertex fetcher gets programmed
    *        to assign this with primitive instance identifier which will be
    *        used for layered clears. All other renders have only one instance
    *        and therefore the value will be effectively zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable, Z is fixed to
    * zero and W to one. Header words dw0,2,3 are zero. There is no need to
    * include the fixed values in the vertex buffer. Vertex fetcher can be
    * instructed to fill vertex elements with constant values of one and zero
    * instead of reading them from the buffer.
    * Flat inputs are program constants that are not interpolated. Moreover
    * their values will be the same between vertices.
    *
    * See the vertex element setup below.
    */
   unsigned slot = 0;

   /* Slot 0: VUE header (dw0-3).  dw0 is read from buffer 1, the rest
    * are stored as zero; dw1 is overwritten via 3DSTATE_VF_SGVS below.
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards hardware is no more instructed to overwrite
       * components using an element specifier. Instead one has separate
       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
       */
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   slot++;

   /* Slot 1: position (dw4-7).  X/Y/Z come from buffer 0, W is stored
    * as the constant 1.0.
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
   };
   slot++;

   /* Slots 2..n: one vec4 of flat input per varying, read from buffer 1
    * past its 16-byte header.
    */
   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

   /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
    * primitive instance identifier. This is used for layered clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

#if GFX_VER >= 11
   /* Gfx11+ has a second SGVS packet; emit it zeroed. */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   /* No instanced fetch on any element. */
   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
}
589 
590 /* 3DSTATE_VIEWPORT_STATE_POINTERS */
static uint32_t
blorp_emit_cc_viewport(struct blorp_batch *batch)
{
   /* Emit (or reuse) a CC_VIEWPORT and point the hardware at it via
    * 3DSTATE_VIEWPORT_STATE_POINTERS_CC.  Returns the dynamic-state
    * offset of the viewport.
    */
   uint32_t cc_vp_offset;

   /* Somehow reusing CC_VIEWPORT on Gfx9 is causing issues :
    *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
    */
   if (GFX_VER != 9 && batch->blorp->config.use_cached_dynamic_states) {
      cc_vp_offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_CC_VIEWPORT);
   } else {
      /* Depth range is [0, 1] unless the driver asked for an
       * unrestricted range, in which case use [-FLT_MAX, FLT_MAX].
       */
      blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
         vp.MinimumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           -FLT_MAX : 0.0;
         vp.MaximumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           FLT_MAX : 1.0;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
      vsp.CCViewportPointer = cc_vp_offset;
   }

   return cc_vp_offset;
}
616 
617 static uint32_t
blorp_emit_sampler_state(struct blorp_batch * batch)618 blorp_emit_sampler_state(struct blorp_batch *batch)
619 {
620    uint32_t offset;
621    blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
622       sampler.MipModeFilter = MIPFILTER_NONE;
623       sampler.MagModeFilter = MAPFILTER_LINEAR;
624       sampler.MinModeFilter = MAPFILTER_LINEAR;
625       sampler.MinLOD = 0;
626       sampler.MaxLOD = 0;
627       sampler.TCXAddressControlMode = TCM_CLAMP;
628       sampler.TCYAddressControlMode = TCM_CLAMP;
629       sampler.TCZAddressControlMode = TCM_CLAMP;
630       sampler.MaximumAnisotropy = RATIO21;
631       sampler.RAddressMinFilterRoundingEnable = true;
632       sampler.RAddressMagFilterRoundingEnable = true;
633       sampler.VAddressMinFilterRoundingEnable = true;
634       sampler.VAddressMagFilterRoundingEnable = true;
635       sampler.UAddressMinFilterRoundingEnable = true;
636       sampler.UAddressMagFilterRoundingEnable = true;
637       sampler.NonnormalizedCoordinateEnable = true;
638    }
639 
640    return offset;
641 }
642 
643 UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch * batch)644 blorp_emit_sampler_state_ps(struct blorp_batch *batch)
645 {
646    uint32_t offset = batch->blorp->config.use_cached_dynamic_states ?
647       blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_SAMPLER) :
648       blorp_emit_sampler_state(batch);
649 
650    blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
651       ssp.PointertoPSSamplerState = offset;
652    }
653 
654    return offset;
655 }
656 
657 /* What follows is the code for setting up a "pipeline". */
658 
static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   /* Emit 3DSTATE_VS.  When no VS program is supplied, the packet is
    * emitted with its header-only defaults (VS not enabled); otherwise
    * the kernel is set up for SIMD8 dispatch.
    */
   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
   assert(!vs_prog_data || GFX_VER < 11 ||
          vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

         assert(vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
#if GFX_VER < 20
         /* Field not present on Gfx20+. */
         vs.SIMD8DispatchEnable = true;
#endif
      }
   }
}
689 
static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   /* Emit 3DSTATE_SF, 3DSTATE_RASTER and 3DSTATE_SBE for the RECTLIST
    * draw.
    */
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
    *     use of screen- space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *     SOLID: Any triangle or rectangle object found to be front-facing
    *     is rendered as a solid object. This setting is required when
    *     rendering rectangle (RECTLIST) objects.
    */

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      /* Read offset 1 (256-bit units) skips the VUE header and position
       * (dw0-7, see the VUE layout in blorp_emit_vertex_elements).
       */
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      /* Every attribute is a full vec4 (matches the vertex elements). */
      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }
}
743 
/* Emit the pixel-shader pipeline state for a blorp operation: 3DSTATE_WM,
 * 3DSTATE_PS and 3DSTATE_PS_EXTRA.  Handles both the "has a WM program"
 * case (blits/clears with a shader) and the shader-less case (e.g. HiZ ops),
 * as well as the render-target fast-clear/resolve controls.
 */
static void
blorp_emit_ps_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
    * nonzero to prevent the GPU from hanging.  While the documentation doesn't
    * mention this explicitly, it notes that the valid range for the field is
    * [1,39] = [2,40] threads, which excludes zero.
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;

   /* An all-defaults 3DSTATE_WM; thread dispatch is controlled via
    * 3DSTATE_PS/PS_EXTRA below.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      if (params->src.enabled) {
         ps.SamplerCount = 1; /* Up to 4 samplers */
         ps.BindingTableEntryCount = 2;
      } else {
         ps.BindingTableEntryCount = 1;
      }

      /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
      if (GFX_VER == 11)
         ps.SamplerCount = 0;

      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
       * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
       * k, it implies 2(k+1) threads. It implicitly scales for different GT
       * levels (which have some # of PSDs).
       */
      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;

      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
#if GFX_VER < 20
#if GFX_VER >= 10
      case ISL_AUX_OP_AMBIGUATE:
         ps.RenderTargetFastClearEnable = true;
         ps.RenderTargetResolveType = FAST_CLEAR_0;
         break;
#endif /* GFX_VER >= 10 */
      case ISL_AUX_OP_PARTIAL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_PARTIAL;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         /* WA 1406738321:
          * In-place full resolve of a 3D/Volume surface is not supported.
          * In order to fully resolve 3D/volume surface, copy operation must be
          * performed to a new destination (declared as uncompressed) using the
          * compressed 3D surface as a source.
          */
#if GFX_VERx10 == 120
         assert(params->src.surf.dim != ISL_SURF_DIM_3D);
#endif
         ps.RenderTargetResolveType = RESOLVE_FULL;
         break;
#endif /* GFX_VER < 20 */
      case ISL_AUX_OP_FAST_CLEAR:
         /* WA 1406738321:
          * 3D/Volumetric surfaces do not support Fast Clear operation.
          */
#if GFX_VERx10 == 120
         assert(params->dst.surf.dim != ISL_SURF_DIM_3D);
#endif
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }

      /* The RENDER_SURFACE_STATE page for TGL says:
       *
       *   For an 8 bpp surface with NUM_MULTISAMPLES = 1, Surface Width not
       *   multiple of 64 pixels and more than 1 mip level in the view, Fast
       *   Clear is not supported when AUX_CCS_E is set in this field.
       *
       * The granularity of a fast-clear or ambiguate operation is likely one
       * CCS element. For an 8 bpp primary surface, this maps to 32px x 4rows.
       * Due to the surface layout parameters, if LOD0's width isn't a
       * multiple of 64px, LOD1 and LOD2+ will share CCS elements. Assert that
       * these operations aren't occurring on these LODs.
       *
       * We don't explicitly check for TGL+ because the restriction is
       * technically applicable to all hardware. Platforms prior to TGL don't
       * support CCS on 8 bpp surfaces. So, these unaligned fast clear
       * operations shouldn't be occurring prior to TGL as well.
       */
      if (isl_format_get_layout(params->dst.surf.format)->bpb == 8 &&
          params->dst.surf.logical_level0_px.width % 64 != 0 &&
          params->dst.surf.levels >= 3 &&
          params->dst.view.base_level >= 1) {
         assert(params->num_samples == 1);
         assert(!ps.RenderTargetFastClearEnable);
      }

      if (prog_data) {
         /* Pick SIMD8/16/32 dispatch and fill the per-dispatch-width kernel
          * pointers and GRF start registers from the compiled program data.
          */
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
#if GFX_VER < 20
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
#endif

         ps.KernelStartPointer0 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
#if GFX_VER < 20
         ps.KernelStartPointer2 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
#endif
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      /* Texturing blorp shaders may discard, so tell the hardware pixels can
       * be killed.
       */
      if (params->src.enabled)
         psx.PixelShaderKillsPixel = true;

      if (prog_data) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
         psx.PixelShaderComputesStencil = prog_data->computed_stencil;
         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;

#if INTEL_WA_18038825448_GFX_VER
         psx.EnablePSDependencyOnCPsizeChange =
            batch->flags & BLORP_BATCH_FORCE_CPS_DEPENDENCY;
#endif

#if GFX_VER < 20
         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
#else
         /* Bspec 57340 (r59562):
          *
          *   For MSAA fast clear, it (clear shader) must be in per-pixel
          *   dispatch mode.
          *
          * Bspec 56424 (r58933):
          *
          *   Bit 6 of Bit Group 0: Pixel Shader Is Per Sample
          *   If this bit is DISABLED, the dispatch rate is determined by the
          *   value of Pixel Shader Is Per Coarse Pixel.
          *
          *   Bit 4 of Bit Group 0: Pixel Shader Is Per Coarse Pixel
          *   If Pixel Shader Is Per Sample is DISABLED and this bit is
          *   DISABLED, the pixel shader is dispatched at the per pixel
          *   shading rate.
          *
          * The below assertion ensures the MSAA clear shader is in per-pixel
          * dispatch mode.
          */
         if (params->fast_clear_op == ISL_AUX_OP_FAST_CLEAR &&
             params->num_samples > 1) {
            assert(!psx.PixelShaderIsPerSample &&
                   !psx.PixelShaderIsPerCoarsePixel);
         }
#endif
      }
   }
}
917 
/* Emit blend state for a blorp operation: a BLEND_STATE table in dynamic
 * state (header plus one entry per draw buffer, blending disabled, color
 * clamping enabled, per-channel writes masked by params->color_write_disable),
 * then 3DSTATE_BLEND_STATE_POINTERS and 3DSTATE_PS_BLEND.  When the driver
 * caches blorp dynamic states, the pre-baked table is reused instead (only
 * valid for the no-write-disable case).
 */
static void
blorp_emit_blend_state(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   uint32_t offset;
   if (!batch->blorp->config.use_cached_dynamic_states) {
      struct GENX(BLEND_STATE) blend = { };

      /* 96 bytes covers the BLEND_STATE header plus entries for the maximum
       * number of draw buffers — presumably sized for 8 entries; confirm
       * against GENX(BLEND_STATE_ENTRY_length) if entries are added.
       */
      const unsigned size = 96;
      uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
      if (state == NULL)
         return;
      uint32_t *pos = state;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,

            /* color_write_disable is a per-channel bitmask: R=1, G=2, B=4,
             * A=8.
             */
            .WriteDisableRed = params->color_write_disable & 1,
            .WriteDisableGreen = params->color_write_disable & 2,
            .WriteDisableBlue = params->color_write_disable & 4,
            .WriteDisableAlpha = params->color_write_disable & 8,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      blorp_flush_range(batch, state, size);
   } else {
      /* We only cached this case. */
      assert(params->color_write_disable == 0);
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_BLEND);
   }

   blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
      sp.BlendStatePointer = offset;
      sp.BlendStatePointerValid = true;
   }

   blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }
}
966 
/* Emit an all-zero COLOR_CALC_STATE (blorp uses no blend constants, alpha
 * test or stencil refs from CC state) and point the pipeline at it via
 * 3DSTATE_CC_STATE_POINTERS.  Uses the driver's cached copy when available.
 */
static void
blorp_emit_color_calc_state(struct blorp_batch *batch,
                            UNUSED const struct blorp_params *params)
{
   uint32_t offset;

   if (batch->blorp->config.use_cached_dynamic_states)
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_COLOR_CALC);
   else
      blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {}

   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
      sp.ColorCalcStatePointer = offset;
      sp.ColorCalcStatePointerValid = true;
   }
}
983 
/* Emit 3DSTATE_WM_DEPTH_STENCIL (and, on Gfx12+, a disabled
 * 3DSTATE_DEPTH_BOUNDS) configured for the requested blorp operation:
 * depth writes with a NEVER test for full HiZ resolves, plain writes for
 * clears, and always-pass REPLACE stencil for stencil clears.
 */
static void
blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
      if (params->depth.enabled) {
         ds.DepthBufferWriteEnable = true;

         switch (params->hiz_op) {
         /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
          *   - 7.5.3.1 Depth Buffer Clear
          *   - 7.5.3.2 Depth Buffer Resolve
          *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
          */
         case ISL_AUX_OP_FULL_RESOLVE:
            ds.DepthTestEnable = true;
            ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
            break;

         case ISL_AUX_OP_NONE:
         case ISL_AUX_OP_FAST_CLEAR:
         case ISL_AUX_OP_AMBIGUATE:
            ds.DepthTestEnable = false;
            break;
         case ISL_AUX_OP_PARTIAL_RESOLVE:
            unreachable("Invalid HIZ op");
         }
      }

      if (params->stencil.enabled) {
         /* Always-pass test with REPLACE so every covered pixel gets
          * params->stencil_ref written (masked by stencil_mask).
          */
         ds.StencilBufferWriteEnable = true;
         ds.StencilTestEnable = true;
         ds.DoubleSidedStencilEnable = false;

         ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
         ds.StencilPassDepthPassOp = STENCILOP_REPLACE;

         ds.StencilWriteMask = params->stencil_mask;
         ds.StencilReferenceValue = params->stencil_ref;
      }
   }

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
      db.DepthBoundsTestEnable = false;
      db.DepthBoundsTestMinValue = 0.0;
      db.DepthBoundsTestMaxValue = 1.0;
   }
#endif
}
1034 
/* Emit 3DSTATE_MULTISAMPLE for the operation's sample count.  The field is
 * the log2 encoding, computed with ffs (num_samples is presumably a power of
 * two — confirm against callers).
 */
static void
blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
      ms.PixelLocation              = CENTER;
   }
}
1044 
/* Emit the full 3D pipeline state needed by a blorp render: URB config,
 * blend/CC/depth-stencil state, empty push constants, sampler state,
 * multisample/sample-mask, disabled VS/HS/TE/DS/SO/GS stages, clip, SF,
 * PS, CC viewport, and (where applicable) disabled primitive replication
 * and mesh/task pipelines.  The emission order follows the PRM requirements
 * quoted inline.
 */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   /* Blend state is only needed when a fragment shader writes color. */
   if (params->wm_prog_data) {
      blorp_emit_blend_state(batch, params);
   }
   blorp_emit_color_calc_state(batch, params);
   blorp_emit_depth_stencil_state(batch, params);

   UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      pc.ShaderUpdateEnable = 0x1f;
      pc.MOCS = mocs;
   }
#else
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { xs.MOCS = mocs; }
#endif

   if (params->src.enabled)
      blorp_emit_sampler_state_ps(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a
    *   3DSTATE_VS command that causes the VS Function Enable to
    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *   command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
   /* Tessellation, streamout and geometry stages are all disabled
    * (default-zeroed packets).
    */
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
      blorp_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
#endif
   }
}
1121 
1122 /******** This is the end of the pipeline setup code ********/
1123 
1124 static void
blorp_emit_memcpy(struct blorp_batch * batch,struct blorp_address dst,struct blorp_address src,uint32_t size)1125 blorp_emit_memcpy(struct blorp_batch *batch,
1126                   struct blorp_address dst,
1127                   struct blorp_address src,
1128                   uint32_t size)
1129 {
1130    assert(size % 4 == 0);
1131 
1132    for (unsigned dw = 0; dw < size; dw += 4) {
1133       blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
1134          cp.DestinationMemoryAddress = dst;
1135          cp.SourceMemoryAddress = src;
1136       }
1137       dst.offset += 4;
1138       src.offset += 4;
1139    }
1140 }
1141 
/* Fill out a RENDER_SURFACE_STATE at `state`/`state_offset` for `surface`,
 * including auxiliary (CCS/MCS/HiZ) and clear-color plumbing, and emit the
 * relocations the driver needs for the embedded addresses.
 */
static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         uint8_t color_write_disable,
                         bool is_render_target)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   /* A 1-D surface laid out as 2-D can simply be treated as 2-D here. */
   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   const bool use_clear_address =
      GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);

   /* On gfx12 (and optionally on gfx11), hardware will read and write to the
    * clear color address, converting the raw clear color channels to a pixel
    * during a fast-clear. To avoid the restrictions associated with the
    * hardware feature, we instead write a software-converted pixel ourselves.
    * If we're performing a fast-clear, provide a substitute address to avoid
    * a collision with hardware. Outside of gfx11 and gfx12, indirect clear
    * color BOs are not used during fast-clears.
    */
   const struct blorp_address op_clear_addr =
      aux_op == ISL_AUX_OP_FAST_CLEAR ? blorp_get_workaround_address(batch) :
                                        surface->clear_color_addr;

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .clear_address = !use_clear_address ? 0 :
                          blorp_get_surface_address(batch, op_clear_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .use_clear_address = use_clear_address);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information.  This should be ok, however, because
       * surface buffer addresses are always 4K page aligned.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 10
      /* Clear color state must be 64B aligned on gfx10+. */
      assert((surface->clear_color_addr.offset & 0x3f) == 0);
      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
      blorp_surface_reloc(batch, state_offset +
                          isl_dev->ss.clear_color_state_offset,
                          op_clear_addr, *clear_addr);
#else
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything.  We can avoid the MI memcpy on that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1236 
/* Write a SURFTYPE_NULL RENDER_SURFACE_STATE into `state`, sized to match
 * `surface` so depth/stencil-only operations have a render target binding of
 * the right dimensions without any backing memory.
 */
static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct blorp_surface_info *surface,
                              uint32_t *state)
{
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      /* Hardware fields are "value minus one" encodings. */
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
      .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),

      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,

#if GFX_VERx10 >= 125
      .TileMode = TILE4,
#else
      .TileMode = YMAJOR,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1267 
/* Build the PS binding table for a blorp operation: slot 0 is the render
 * target (or a null surface when only depth/stencil is involved) and slot 1,
 * if present, is the source texture.  Returns the binding table offset, or 0
 * if allocation failed.
 */
static uint32_t
blorp_setup_binding_table(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t surface_offsets[2], bind_offset = 0;
   void *surface_maps[2];

   if (params->use_pre_baked_binding_table) {
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      if (!blorp_alloc_binding_table(batch, num_surfaces,
                                     isl_dev->ss.size, isl_dev->ss.align,
                                     &bind_offset, surface_offsets, surface_maps))
         return 0;

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
      } else {
         assert(params->depth.enabled || params->stencil.enabled);
         /* No color target: bind a null surface sized to the depth/stencil
          * buffer instead.
          */
         const struct blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  0, false);
      }
   }

   return bind_offset;
}
1310 
/* Emit binding table pointers for all stages: geometry stages get empty
 * (default-zero) tables, the PS gets the table built by
 * blorp_setup_binding_table().
 */
static void
blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
{
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
}
1324 
/* Emit the 3DSTATE_DEPTH_BUFFER/HIER_DEPTH_BUFFER/STENCIL_BUFFER packet
 * group (via isl) for the operation's depth and/or stencil surfaces,
 * including the relocations for each buffer address, followed by the
 * post-sync PIPE_CONTROL some platforms require after this state.
 */
static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct intel_device_info *devinfo =
      batch->blorp->compiler->brw->devinfo;

   /* isl packs the packets directly into the batch; grab the space first so
    * relocations can point into it.
    */
   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   } else {
      info.mocs = isl_mocs(isl_dev, 0, false);
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);

   if (intel_needs_workaround(devinfo, 1408224581) ||
       intel_needs_workaround(devinfo, 14014097488) ||
       intel_needs_workaround(devinfo, 14016712196)) {
      /* Wa_1408224581
       *
       * Workaround: Gfx12LP Astep only An additional pipe control with
       * post-sync = store dword operation would be required.( w/a is to
       * have an additional pipe control after the stencil state whenever
       * the surface state bits of this state is changing).
       *
       * This also seems sufficient to handle Wa_14014097488 and
       * Wa_14016712196.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = blorp_get_workaround_address(batch);
      }
   }
}
1402 
1403 /* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
1404  * depth/stencil buffer extents are ignored to handle APIs which perform
1405  * clearing operations without such information.
1406  * */
1407 static void
blorp_emit_gfx8_hiz_op(struct blorp_batch * batch,const struct blorp_params * params)1408 blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
1409                        const struct blorp_params *params)
1410 {
1411    /* We should be performing an operation on a depth or stencil buffer.
1412     */
1413    assert(params->depth.enabled || params->stencil.enabled);
1414 
1415    blorp_measure_start(batch, params);
1416 
1417    /* The stencil buffer should only be enabled if a fast clear operation is
1418     * requested.
1419     */
1420    if (params->stencil.enabled)
1421       assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
1422 
1423    /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
1424     *
1425     * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
1426     * the Number of Multisamples. This packet must not be used to change
1427     * Number of Multisamples in a rendering sequence.
1428     *
1429     * Since HIZ may be the first thing in a batch buffer, play safe and always
1430     * emit 3DSTATE_MULTISAMPLE.
1431     */
1432    blorp_emit_3dstate_multisample(batch, params);
1433 
1434    /* From the BDW PRM Volume 7, Depth Buffer Clear:
1435     *
1436     *    The clear value must be between the min and max depth values
1437     *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
1438     *    D32_FLOAT, then +/-DENORM values are also allowed.
1439     *
1440     * Set the bounds to match our hardware limits.
1441     */
1442    if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR)
1443       blorp_emit_cc_viewport(batch);
1444 
1445    /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
1446     * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
1447     * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
1448     * disabled for HiZ ops and it appears that force-enabling it can lead to
1449     * GPU hangs on at least Skylake.  Since we don't know the current state of
1450     * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
1451     */
1452    blorp_emit(batch, GENX(3DSTATE_WM), wm);
1453 
1454    /* If we can't alter the depth stencil config and multiple layers are
1455     * involved, the HiZ op will fail. This is because the op requires that a
1456     * new config is emitted for each additional layer.
1457     */
1458    if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
1459       assert(params->num_layers <= 1);
1460    } else {
1461       blorp_emit_depth_stencil_config(batch, params);
1462    }
1463 
1464    /* TODO - If we ever start using 3DSTATE_WM_HZ_OP::StencilBufferResolveEnable
1465     * we need to implement required steps, flushes documented in Wa_1605967699.
1466     */
1467    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
1468       switch (params->hiz_op) {
1469       case ISL_AUX_OP_FAST_CLEAR:
1470          hzp.StencilBufferClearEnable = params->stencil.enabled;
1471          hzp.DepthBufferClearEnable = params->depth.enabled;
1472          hzp.StencilClearValue = params->stencil_ref;
1473          hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
1474 #if GFX_VER >= 20
1475          hzp.DepthClearValue = params->depth.clear_color.f32[0];
1476 
1477          /* From the Xe2 Bspec 56437 (r61349):
1478           *
1479           *    The Depth Clear value cannot be a NAN (Not-A-Number) if the
1480           *    depth format is Float32.
1481           *
1482           * We're not required to support NaN in APIs, so flush to zero.
1483           */
1484          if (util_is_nan(hzp.DepthClearValue))
1485             hzp.DepthClearValue = 0;
1486 #endif
1487          break;
1488       case ISL_AUX_OP_FULL_RESOLVE:
1489          assert(params->full_surface_hiz_op);
1490          hzp.DepthBufferResolveEnable = true;
1491          break;
1492       case ISL_AUX_OP_AMBIGUATE:
1493          assert(params->full_surface_hiz_op);
1494          hzp.HierarchicalDepthBufferResolveEnable = true;
1495          break;
1496       case ISL_AUX_OP_PARTIAL_RESOLVE:
1497       case ISL_AUX_OP_NONE:
1498          unreachable("Invalid HIZ op");
1499       }
1500 
1501       hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
1502       hzp.SampleMask = 0xFFFF;
1503 
1504       /* Due to a hardware issue, this bit MBZ */
1505       assert(hzp.ScissorRectangleEnable == false);
1506 
1507       /* Contrary to the HW docs both fields are inclusive */
1508       hzp.ClearRectangleXMin = params->x0;
1509       hzp.ClearRectangleYMin = params->y0;
1510 
1511       /* Contrary to the HW docs both fields are exclusive */
1512       hzp.ClearRectangleXMax = params->x1;
1513       hzp.ClearRectangleYMax = params->y1;
1514    }
1515 
   /* A PIPE_CONTROL with all bits clear, except for "Post-Sync Operation"
    * set to "Write Immediate Data", must be emitted here.
    */
1519    blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
1520       pc.PostSyncOperation = WriteImmediateData;
1521       pc.Address = blorp_get_workaround_address(batch);
1522    }
1523 
1524    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
1525 
1526    blorp_measure_end(batch, params);
1527 }
1528 
1529 static bool
blorp_uses_bti_rt_writes(const struct blorp_batch * batch,const struct blorp_params * params)1530 blorp_uses_bti_rt_writes(const struct blorp_batch *batch, const struct blorp_params *params)
1531 {
1532    if (batch->flags & (BLORP_BATCH_USE_BLITTER | BLORP_BATCH_USE_COMPUTE))
1533       return false;
1534 
1535    /* HIZ clears use WM_HZ ops rather than a clear shader using RT writes. */
1536    return params->hiz_op == ISL_AUX_OP_NONE;
1537 }
1538 
/* Execute a blorp operation on the 3D (render) pipeline.
 *
 * HiZ operations are redirected to the dedicated WM_HZ_OP path; everything
 * else is drawn as an instanced RECTLIST covering the destination rectangle,
 * one instance per destination layer.
 */
static void
blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
{
   if (params->hiz_op != ISL_AUX_OP_NONE) {
      blorp_emit_gfx8_hiz_op(batch, params);
      return;
   }

   blorp_emit_vertex_buffers(batch, params);
   blorp_emit_vertex_elements(batch, params);

   blorp_emit_pipeline(batch, params);

   blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));

   /* The driver may own depth/stencil state and forbid us from touching it. */
   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
      blorp_emit_depth_stencil_config(batch, params);

   /* TBIMR is never used by blorp; the field only exists on Gfx12.5+. */
   const UNUSED bool use_tbimr = false;
   blorp_emit_pre_draw(batch, params);
   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = use_tbimr;
#endif
      /* A RECTLIST takes three vertices; one instance per layer. */
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
   blorp_emit_post_draw(batch, params);
}
1571 
/* Build the compute push-constant buffer for a blorp dispatch.
 *
 * Allocates a 64-byte-aligned buffer (general state on Gfx12.5+, dynamic
 * state otherwise) and fills it from params->wm_inputs: the cross-thread
 * section once, then (pre-Gfx12.5 only) one per-thread section per thread
 * whose trailing dword is overwritten with that thread's subgroup id.
 *
 * On success, *state_offset and *state_size describe the buffer.  If the
 * push layout is empty or allocation fails, both are set to 0.
 */
static void
blorp_get_compute_push_const(struct blorp_batch *batch,
                             const struct blorp_params *params,
                             uint32_t threads,
                             uint32_t *state_offset,
                             unsigned *state_size)
{
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const unsigned push_const_size =
      ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
   /* The shader's push layout must cover exactly the blorp uniform inputs. */
   assert(cs_prog_data->push.cross_thread.size +
          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

   if (push_const_size == 0) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }

   uint32_t push_const_offset;
   uint32_t *push_const =
      GFX_VERx10 >= 125 ?
      blorp_alloc_general_state(batch, push_const_size, 64,
                                &push_const_offset) :
      blorp_alloc_dynamic_state(batch, push_const_size, 64,
                                &push_const_offset);
   if (push_const == NULL) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }
   /* Zero the whole buffer so alignment padding is deterministic. */
   memset(push_const, 0x0, push_const_size);

   void *dst = push_const;
   const void *src = (char *)&params->wm_inputs;

   if (cs_prog_data->push.cross_thread.size > 0) {
      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
      dst += cs_prog_data->push.cross_thread.size;
      src += cs_prog_data->push.cross_thread.size;
   }

   /* Gfx12.5+ uses no per-thread push constants (see the COMPUTE_WALKER
    * path, which generates local IDs in hardware instead).
    */
   assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
#if GFX_VERx10 < 125
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < threads; t++) {
         /* Copy everything except the trailing subgroup-id dword... */
         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);

         /* ...and write the thread's subgroup id in its place. */
         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
         *subgroup_id = t;

         dst += cs_prog_data->push.per_thread.size;
      }
   }
#endif

   *state_offset = push_const_offset;
   *state_size = push_const_size;
}
1631 
1632 static void
blorp_exec_compute(struct blorp_batch * batch,const struct blorp_params * params)1633 blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
1634 {
1635    assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
1636    assert(params->hiz_op == ISL_AUX_OP_NONE);
1637 
1638    blorp_measure_start(batch, params);
1639 
1640    const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;
1641    const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
1642    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
1643    const struct intel_cs_dispatch_info dispatch =
1644       brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
1645 
1646    uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
1647    uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
1648    uint32_t group_z0 = params->dst.z_offset;
1649    uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
1650    uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
1651    assert(params->num_layers >= 1);
1652    uint32_t group_z1 = params->dst.z_offset + params->num_layers;
1653    assert(cs_prog_data->local_size[2] == 1);
1654 
1655 #if GFX_VERx10 >= 125
1656    assert(cs_prog_data->push.per_thread.regs == 0);
1657    blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
1658       cw.SIMDSize                       = dispatch.simd_size / 16;
1659       cw.MessageSIMD                    = dispatch.simd_size / 16,
1660       cw.LocalXMaximum                  = cs_prog_data->local_size[0] - 1;
1661       cw.LocalYMaximum                  = cs_prog_data->local_size[1] - 1;
1662       cw.LocalZMaximum                  = cs_prog_data->local_size[2] - 1;
1663       cw.ThreadGroupIDStartingX         = group_x0;
1664       cw.ThreadGroupIDStartingY         = group_y0;
1665       cw.ThreadGroupIDStartingZ         = group_z0;
1666       cw.ThreadGroupIDXDimension        = group_x1;
1667       cw.ThreadGroupIDYDimension        = group_y1;
1668       cw.ThreadGroupIDZDimension        = group_z1;
1669       cw.ExecutionMask                  = 0xffffffff;
1670       cw.PostSync.MOCS                  = isl_mocs(batch->blorp->isl_dev, 0, false);
1671 
1672       uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
1673 
1674       uint32_t samplers_offset =
1675          params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
1676 
1677       uint32_t push_const_offset;
1678       unsigned push_const_size;
1679       blorp_get_compute_push_const(batch, params, dispatch.threads,
1680                                    &push_const_offset, &push_const_size);
1681       cw.IndirectDataStartAddress       = push_const_offset;
1682       cw.IndirectDataLength             = push_const_size;
1683 
1684 #if GFX_VERx10 >= 125
1685       cw.GenerateLocalID                = cs_prog_data->generate_local_id != 0;
1686       cw.EmitLocal                      = cs_prog_data->generate_local_id;
1687       cw.WalkOrder                      = cs_prog_data->walk_order;
1688       cw.TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
1689                       TileY32bpe : Linear;
1690 #endif
1691 
1692       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
1693          .KernelStartPointer = params->cs_prog_kernel,
1694          .SamplerStatePointer = samplers_offset,
1695          .SamplerCount = params->src.enabled ? 1 : 0,
1696          .BindingTableEntryCount = params->src.enabled ? 2 : 1,
1697          .BindingTablePointer = surfaces_offset,
1698          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
1699          .SharedLocalMemorySize =
1700             intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
1701          .PreferredSLMAllocationSize =
1702             intel_compute_preferred_slm_calc_encode_size(devinfo,
1703                                                          prog_data->total_shared,
1704                                                          dispatch.group_size,
1705                                                          dispatch.simd_size),
1706          .NumberOfBarriers = cs_prog_data->uses_barrier,
1707       };
1708    }
1709 
1710 #else
1711 
1712    /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
1713     *
1714     * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
1715     *  the only bits that are changed are scoreboard related: Scoreboard
1716     *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
1717     *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
1718     *
1719     * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
1720     * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
1721     */
1722    blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
1723       pc.CommandStreamerStallEnable = true;
1724       pc.StallAtPixelScoreboard = true;
1725    }
1726 
1727    blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
1728       assert(prog_data->total_scratch == 0);
1729       vfe.MaximumNumberofThreads =
1730          devinfo->max_cs_threads * devinfo->subslice_total - 1;
1731       vfe.NumberofURBEntries = 2;
1732 #if GFX_VER < 11
1733       vfe.ResetGatewayTimer =
1734          Resettingrelativetimerandlatchingtheglobaltimestamp;
1735 #endif
1736       vfe.URBEntryAllocationSize = 2;
1737 
1738       const uint32_t vfe_curbe_allocation =
1739          ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
1740                cs_prog_data->push.cross_thread.regs, 2);
1741       vfe.CURBEAllocationSize = vfe_curbe_allocation;
1742    }
1743 
1744    uint32_t push_const_offset;
1745    unsigned push_const_size;
1746    blorp_get_compute_push_const(batch, params, dispatch.threads,
1747                                 &push_const_offset, &push_const_size);
1748 
1749    blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1750       curbe.CURBETotalDataLength = push_const_size;
1751       curbe.CURBEDataStartAddress = push_const_offset;
1752    }
1753 
1754    uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
1755 
1756    uint32_t samplers_offset =
1757       params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
1758 
1759    struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
1760       .KernelStartPointer = params->cs_prog_kernel,
1761       .SamplerStatePointer = samplers_offset,
1762       .SamplerCount = params->src.enabled ? 1 : 0,
1763       .BindingTableEntryCount = params->src.enabled ? 2 : 1,
1764       .BindingTablePointer = surfaces_offset,
1765       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
1766       .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
1767       .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
1768                                                              prog_data->total_shared),
1769       .BarrierEnable = cs_prog_data->uses_barrier,
1770       .CrossThreadConstantDataReadLength =
1771          cs_prog_data->push.cross_thread.regs,
1772    };
1773 
1774    uint32_t idd_offset;
1775    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1776    void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
1777    if (state == NULL)
1778       return;
1779    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);
1780 
1781    blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1782       mid.InterfaceDescriptorTotalLength        = size;
1783       mid.InterfaceDescriptorDataStartAddress   = idd_offset;
1784    }
1785 
1786    blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
1787       ggw.SIMDSize                     = dispatch.simd_size / 16;
1788       ggw.ThreadDepthCounterMaximum    = 0;
1789       ggw.ThreadHeightCounterMaximum   = 0;
1790       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
1791       ggw.ThreadGroupIDStartingX       = group_x0;
1792       ggw.ThreadGroupIDStartingY       = group_y0;
1793       ggw.ThreadGroupIDStartingResumeZ = group_z0;
1794       ggw.ThreadGroupIDXDimension      = group_x1;
1795       ggw.ThreadGroupIDYDimension      = group_y1;
1796       ggw.ThreadGroupIDZDimension      = group_z1;
1797       ggw.RightExecutionMask           = dispatch.right_mask;
1798       ggw.BottomExecutionMask          = 0xffffffff;
1799    }
1800 
1801 #endif
1802 
1803    blorp_measure_end(batch, params);
1804 }
1805 
1806 /* -----------------------------------------------------------------------
1807  * -- BLORP on blitter
1808  * -----------------------------------------------------------------------
1809  */
1810 
1811 #include "isl/isl_genX_helpers.h"
1812 
1813 #if GFX_VER >= 12
1814 static uint32_t
xy_bcb_tiling(const struct isl_surf * surf)1815 xy_bcb_tiling(const struct isl_surf *surf)
1816 {
1817    switch (surf->tiling) {
1818    case ISL_TILING_LINEAR:
1819       return XY_TILE_LINEAR;
1820 #if GFX_VERx10 >= 125
1821    case ISL_TILING_X:
1822       return XY_TILE_X;
1823    case ISL_TILING_4:
1824       return XY_TILE_4;
1825    case ISL_TILING_64:
1826    case ISL_TILING_64_XE2:
1827       return XY_TILE_64;
1828 #else
1829    case ISL_TILING_Y0:
1830       return XY_TILE_Y;
1831 #endif
1832    default:
1833       unreachable("Invalid tiling for XY_BLOCK_COPY_BLT");
1834    }
1835 }
1836 
1837 static uint32_t
xy_color_depth(const struct isl_format_layout * fmtl)1838 xy_color_depth(const struct isl_format_layout *fmtl)
1839 {
1840    switch (fmtl->bpb) {
1841    case 128: return XY_BPP_128_BIT;
1842    case  96: return XY_BPP_96_BIT;
1843    case  64: return XY_BPP_64_BIT;
1844    case  32: return XY_BPP_32_BIT;
1845    case  16: return XY_BPP_16_BIT;
1846    case   8: return XY_BPP_8_BIT;
1847    default:
1848       unreachable("Invalid bpp");
1849    }
1850 }
1851 #endif
1852 
1853 #if GFX_VERx10 >= 125
1854 static uint32_t
xy_bcb_surf_dim(const struct isl_surf * surf)1855 xy_bcb_surf_dim(const struct isl_surf *surf)
1856 {
1857    switch (surf->dim) {
1858    case ISL_SURF_DIM_1D:
1859       return XY_SURFTYPE_1D;
1860    case ISL_SURF_DIM_2D:
1861       return XY_SURFTYPE_2D;
1862    case ISL_SURF_DIM_3D:
1863       return XY_SURFTYPE_3D;
1864    default:
1865       unreachable("Invalid dimensionality for XY_BLOCK_COPY_BLT");
1866    }
1867 }
1868 
1869 static uint32_t
xy_bcb_surf_depth(const struct isl_surf * surf)1870 xy_bcb_surf_depth(const struct isl_surf *surf)
1871 {
1872    return surf->dim == ISL_SURF_DIM_3D ? surf->logical_level0_px.depth
1873                                        : surf->logical_level0_px.array_len;
1874 }
1875 
1876 #if GFX_VER < 20
1877 static uint32_t
xy_aux_mode(const struct blorp_surface_info * info)1878 xy_aux_mode(const struct blorp_surface_info *info)
1879 {
1880    switch (info->aux_usage) {
1881    case ISL_AUX_USAGE_CCS_E:
1882    case ISL_AUX_USAGE_FCV_CCS_E:
1883    case ISL_AUX_USAGE_STC_CCS:
1884       return XY_CCS_E;
1885    case ISL_AUX_USAGE_NONE:
1886       return XY_NONE;
1887    default:
1888       unreachable("Unsupported aux mode");
1889    }
1890 }
1891 #endif // GFX_VER < 20
1892 #endif // GFX_VERx10 >= 125
1893 
/* Copy a rectangle from params->src to params->dst with the blitter's
 * XY_BLOCK_COPY_BLT command (Gfx12+ only).  Source coordinates are
 * recovered from the coordinate-transform offsets blorp stored in
 * wm_inputs, since blitter copies are always unscaled.
 */
UNUSED static void
blorp_xy_block_copy_blt(struct blorp_batch *batch,
                        const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   /* This path handles exactly one layer and one miplevel per command. */
   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(params->src.view.levels == 1);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   /* The source rectangle is the destination rectangle shifted by the
    * coord-transform offset; the asserts below confirm equal extents.
    */
   unsigned dst_x0 = params->x0;
   unsigned dst_x1 = params->x1;
   unsigned src_x0 =
      dst_x0 - params->wm_inputs.coord_transform[0].offset;
   ASSERTED unsigned src_x1 =
      dst_x1 - params->wm_inputs.coord_transform[0].offset;
   unsigned dst_y0 = params->y0;
   unsigned dst_y1 = params->y1;
   unsigned src_y0 =
      dst_y0 - params->wm_inputs.coord_transform[1].offset;
   ASSERTED unsigned src_y1 =
      dst_y1 - params->wm_inputs.coord_transform[1].offset;

   assert(src_x1 - src_x0 == dst_x1 - dst_x0);
   assert(src_y1 - src_y0 == dst_y1 - dst_y0);

   const struct isl_surf *src_surf = &params->src.surf;
   const struct isl_surf *dst_surf = &params->dst.surf;

   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   /* 96bpp formats are only handled on linear surfaces here. */
   if (fmtl->bpb == 96) {
      assert(src_surf->tiling == ISL_TILING_LINEAR &&
             dst_surf->tiling == ISL_TILING_LINEAR);
   }

   assert(src_surf->samples == 1);
   assert(dst_surf->samples == 1);

   /* Pitch fields are in bytes for linear surfaces and 4-byte units for
    * tiled ones.
    */
   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
   unsigned src_pitch_unit = src_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d src_align = isl_get_image_alignment(src_surf);
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

   blorp_emit(batch, GENX(XY_BLOCK_COPY_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationMOCS = params->dst.addr.mocs;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = dst_x0;
      blt.DestinationY1 = dst_y0;
      blt.DestinationX2 = dst_x1;
      blt.DestinationY2 = dst_y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
         /* Clear color only matters when a clear-color address exists. */
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
      }
#endif

      blt.SourceX1 = src_x0;
      blt.SourceY1 = src_y0;
      blt.SourcePitch = (src_surf->row_pitch_B / src_pitch_unit) - 1;
      blt.SourceMOCS = params->src.addr.mocs;
      blt.SourceTiling = xy_bcb_tiling(src_surf);
      blt.SourceBaseAddress = params->src.addr;
      blt.SourceXOffset = params->src.tile_x_sa;
      blt.SourceYOffset = params->src.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.SourceSurfaceType = xy_bcb_surf_dim(src_surf);
      blt.SourceSurfaceWidth = src_surf->logical_level0_px.w - 1;
      blt.SourceSurfaceHeight = src_surf->logical_level0_px.h - 1;
      blt.SourceSurfaceDepth = xy_bcb_surf_depth(src_surf) - 1;
      blt.SourceArrayIndex =
         params->src.view.base_array_layer + params->src.z_offset;
      blt.SourceSurfaceQPitch = isl_get_qpitch(src_surf) >> 2;
      blt.SourceLOD = params->src.view.base_level;
      blt.SourceMipTailStartLOD = src_surf->miptail_start_level;
      blt.SourceHorizontalAlign = isl_encode_halign(src_align.width);
      blt.SourceVerticalAlign = isl_encode_valign(src_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.SourceDepthStencilResource =
         params->src.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.SourceTargetMemory =
         params->src.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->src.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.SourceAuxiliarySurfaceMode = xy_aux_mode(&params->src);
         blt.SourceCompressionEnable = true;
#endif
         blt.SourceCompressionFormat =
            isl_get_render_compression_format(src_surf->format);
         blt.SourceClearValueEnable = !!params->src.clear_color_addr.buffer;
         blt.SourceClearAddress = params->src.clear_color_addr;
      }
#endif
   }
#endif
}
2043 
/* Fill a rectangle of params->dst with the packed clear color using the
 * blitter's XY_FAST_COLOR_BLT command (Gfx12+ only).
 */
UNUSED static void
blorp_xy_fast_color_blit(struct blorp_batch *batch,
                         const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct isl_surf *dst_surf = &params->dst.surf;
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   /* One layer, one miplevel, single-sampled; 96bpp requires linear. */
   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(dst_surf->samples == 1);
   assert(fmtl->bpb != 96 || dst_surf->tiling == ISL_TILING_LINEAR);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   /* Pitch is in bytes for linear surfaces and 4-byte units for tiled. */
   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

#if INTEL_NEEDS_WA_16021021469
   /* Wa_16021021469: 96bpp fast-color blits are not allowed. */
   assert(fmtl->bpb != 96);
#endif

   blorp_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = params->x0;
      blt.DestinationY1 = params->y0;
      blt.DestinationX2 = params->x1;
      blt.DestinationY2 = params->y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

      /* Pack the clear color into the command in the destination format. */
      isl_color_value_pack((union isl_color_value *)
                           params->wm_inputs.clear_color,
                           params->dst.view.format, blt.FillColor);

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
      /* XY_FAST_COLOR_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VERx10 == 125
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
      }

      blt.DestinationMOCS = params->dst.addr.mocs;
#endif
   }
#endif
}
2131 
2132 static void
blorp_exec_blitter(struct blorp_batch * batch,const struct blorp_params * params)2133 blorp_exec_blitter(struct blorp_batch *batch,
2134                    const struct blorp_params *params)
2135 {
2136    blorp_measure_start(batch, params);
2137 
2138    if (params->src.enabled)
2139       blorp_xy_block_copy_blt(batch, params);
2140    else
2141       blorp_xy_fast_color_blit(batch, params);
2142 
2143    blorp_measure_end(batch, params);
2144 }
2145 
2146 /**
2147  * \brief Execute a blit or render pass operation.
2148  *
2149  * To execute the operation, this function manually constructs and emits a
2150  * batch to draw a rectangle primitive. The batchbuffer is flushed before
2151  * constructing and after emitting the batch.
2152  *
2153  * This function alters no GL state.
2154  */
2155 static void
blorp_exec(struct blorp_batch * batch,const struct blorp_params * params)2156 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
2157 {
2158    if (batch->flags & BLORP_BATCH_USE_BLITTER) {
2159       blorp_exec_blitter(batch, params);
2160    } else if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
2161       blorp_exec_compute(batch, params);
2162    } else {
2163       blorp_exec_3d(batch, params);
2164    }
2165 }
2166 
/* Pre-pack the dynamic state objects blorp needs (blend, CC viewport,
 * color-calc, sampler) and hand them to the driver via
 * context->upload_dynamic_state so they don't have to be re-packed per
 * batch.
 */
static void
blorp_init_dynamic_states(struct blorp_context *context)
{
   {
      struct GENX(BLEND_STATE) blend = { };

      /* BLEND_STATE header followed by one entry per RT (8 = MAX_RTS).
       * NOTE(review): the *_length macros count dwords and dws is a dword
       * array, so the extra "* 4" over-allocates by 4x and uploads
       * uninitialized padding — harmless in practice, but verify intent.
       */
      uint32_t dws[GENX(BLEND_STATE_length) * 4 +
                   GENX(BLEND_STATE_ENTRY_length) * 4 * 8 /* MAX_RTS */];
      uint32_t *pos = dws;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < 8; ++i) {
         /* Clamp pre- and post-blend colors to the RT format's range. */
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      context->upload_dynamic_state(context, dws, sizeof(dws), 64,
                                    BLORP_DYNAMIC_STATE_BLEND);
   }

   /* Widen the depth clamp to the full float range when the context is
    * configured for unrestricted depth; otherwise clamp to [0, 1].
    */
   blorp_context_upload_dynamic(context, GENX(CC_VIEWPORT), vp, 32,
                                BLORP_DYNAMIC_STATE_CC_VIEWPORT) {
      vp.MinimumDepth = context->config.use_unrestricted_depth_range ?
                        -FLT_MAX : 0.0;
      vp.MaximumDepth = context->config.use_unrestricted_depth_range ?
                        FLT_MAX : 1.0;
   }

   blorp_context_upload_dynamic(context, GENX(COLOR_CALC_STATE), cc, 64,
                                BLORP_DYNAMIC_STATE_COLOR_CALC) {
      /* Nothing */
   }

   /* Non-normalized-coordinate, clamped, bilinear sampler with mipmapping
    * disabled (MinLOD == MaxLOD == 0).
    */
   blorp_context_upload_dynamic(context, GENX(SAMPLER_STATE), sampler, 32,
                                BLORP_DYNAMIC_STATE_SAMPLER) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }
}
2227 
2228 #endif /* BLORP_GENX_EXEC_BRW_H */
2229