xref: /aosp_15_r20/external/mesa3d/src/intel/blorp/blorp_genX_exec_elk.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #ifndef BLORP_GENX_EXEC_ELK_H
25 #define BLORP_GENX_EXEC_ELK_H
26 
27 #if GFX_VER > 8
28 #error "ELK doesn't support Gfx > 8."
29 #endif
30 
31 #include "blorp_priv.h"
32 #include "dev/intel_device_info.h"
33 #include "common/intel_compute_slm.h"
34 #include "common/intel_sample_positions.h"
35 #include "common/intel_l3_config.h"
36 #include "genxml/gen_macros.h"
37 #include "intel/compiler/elk/elk_compiler.h"
38 
39 /**
40  * This file provides the blorp pipeline setup and execution functionality.
41  * It defines the following function:
42  *
43  * static void
44  * blorp_exec(struct blorp_context *blorp, void *batch_data,
45  *            const struct blorp_params *params);
46  *
47  * It is the job of whoever includes this header to wrap this in something
48  * to get an externally visible symbol.
49  *
50  * In order for the blorp_exec function to work, the driver must provide
51  * implementations of the following static helper functions.
52  */
53 
/* Allocate `n` dwords of batch (command stream) space; may return NULL on
 * failure (callers such as blorp_emit() check for that).
 */
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

/* Emit a relocation for `address` (+ delta) at `location` in the batch and
 * return the value to be written into the command stream.
 */
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta);

/* Hooks into the driver's measurement infrastructure, called at the start
 * and end of a blorp operation.
 */
static void
blorp_measure_start(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_measure_end(struct blorp_batch *batch,
                  const struct blorp_params *params);

/* Allocate dynamic (indirect) state; *offset receives the state offset the
 * hardware packets are programmed with.
 */
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Like blorp_alloc_dynamic_state() but from the general state heap. */
UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Allocate vertex-buffer memory; *addr receives its address.  May return
 * NULL on failure (checked by the vertex-data emit helpers below).
 */
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr);

/* Driver workaround hook invoked with the vertex-buffer addresses/sizes
 * before 3DSTATE_VERTEX_BUFFERS is emitted.
 */
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           uint32_t *sizes,
                                           unsigned num_vbs);

/* Scratch address used for PIPE_CONTROL post-sync-operation writes. */
UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch);

/* Allocate a binding table plus its surface states; returns false on
 * allocation failure.
 */
static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps);

/* Convert a binding-table allocation offset into the pointer value that is
 * programmed into the hardware packet.
 */
static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset);

/* Flush CPU writes in [start, start + size) so the GPU observes them. */
static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

/* Emit a relocation for an address field inside surface state at
 * `ss_offset`.
 */
static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta);

/* Resolve a blorp_address to the 64-bit address programmed in state. */
static uint64_t
blorp_get_surface_address(struct blorp_batch *batch,
                          struct blorp_address address);

#if GFX_VER >= 7
/* Base address that surface-state offsets are relative to. */
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

#if GFX_VER >= 7
/* Current L3 configuration; used below when sizing the URB. */
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch);
#endif

/* Notify the driver of the URB configuration blorp is about to use
 * (Gfx7+), or have the driver emit it itself (pre-Gfx7 path).
 */
static void
blorp_pre_emit_urb_config(struct blorp_batch *batch,
                          struct intel_urb_config *urb_config);

static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      struct intel_urb_config *urb_config);

/* Driver hook to emit any pipeline state blorp itself does not own. */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);

/* Driver hooks called immediately before/after the draw is emitted. */
static void
blorp_emit_pre_draw(struct blorp_batch *batch,
                    const struct blorp_params *params);
static void
blorp_emit_post_draw(struct blorp_batch *batch,
                     const struct blorp_params *params);
142 
143 static inline unsigned
elk_blorp_get_urb_length(const struct elk_wm_prog_data * prog_data)144 elk_blorp_get_urb_length(const struct elk_wm_prog_data *prog_data)
145 {
146    if (prog_data == NULL)
147       return 1;
148 
149    /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
150     *
151     * read_length = ceiling((max_source_attr+1)/2)
152     */
153    return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
154 }
155 
156 /***** BEGIN blorp_exec implementation ******/
157 
158 static uint64_t
_blorp_combine_address(struct blorp_batch * batch,void * location,struct blorp_address address,uint32_t delta)159 _blorp_combine_address(struct blorp_batch *batch, void *location,
160                        struct blorp_address address, uint32_t delta)
161 {
162    if (address.buffer == NULL) {
163       return address.offset + delta;
164    } else {
165       return blorp_emit_reloc(batch, location, address, delta);
166    }
167 }
168 
/* Glue for the genxml pack functions: addresses are struct blorp_address
 * and get combined/relocated through _blorp_combine_address().
 */
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address

#include "genxml/genX_pack.h"
#include "common/intel_genX_state_elk.h"

#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command.  The for-loop trick gives the caller a block
 * in which to set fields of `name`; the struct is packed into the batch
 * when the block exits.  The block is skipped if the dword allocation
 * failed.
 */
#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

/* Emit a variable-length (`n` dwords) command; evaluates to a pointer to
 * the payload (dw[1]) or NULL if the dword allocation failed.
 */
#define blorp_emitn(batch, cmd, n, ...) ({                  \
      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
      if (_dw) {                                            \
         struct cmd template = {                            \
            _blorp_cmd_header(cmd),                         \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__                                     \
         };                                                 \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
      }                                                     \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
   })

/* Zero-initialized struct rvalue of type `struct S`. */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Like blorp_emit(), but packs the structure into dynamic state memory
 * (blorp_alloc_dynamic_state) rather than the command stream, flushes the
 * written range, and returns the state offset in *offset.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset)      \
   for (struct state name = STRUCT_ZERO(state),                         \
        *_dst = blorp_alloc_dynamic_state(batch,                   \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset);               \
        __builtin_expect(_dst != NULL, 1);                              \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
        _dst = NULL)
212 
213 /* 3DSTATE_URB
214  * 3DSTATE_URB_VS
215  * 3DSTATE_URB_HS
216  * 3DSTATE_URB_DS
217  * 3DSTATE_URB_GS
218  *
219  * Assign the entire URB to the VS. Even though the VS is disabled, URB space
220  * is still needed because the clipper loads the VUE's from the URB. From
221  * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
222  * Dword 1.15:0 "VS Number of URB Entries":
223  *     This field is always used (even if VS Function Enable is DISABLED).
224  *
225  * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
226  * safely ignore it because this batch contains only one draw call.
227  *     Because of URB corruption caused by allocating a previous GS unit
228  *     URB entry to the VS unit, software is required to send a “GS NULL
229  *     Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
230  *     plus a dummy DRAW call before any case where VS will be taking over
231  *     GS URB space.
232  *
233  * If 3DSTATE_URB_VS is emitted, then the others must be emitted as well.
234  * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
235  *
236  *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
237  *     programmed in order for the programming of this state to be
238  *     valid.
239  */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once vertex fetcher has written full VUE entries with complete
    * header the space requirement is as follows per vertex (in bytes):
    *
    *     Header    Position    Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |      n x 16       |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for number of varying inputs expressed as vec4s.
    */
   struct elk_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);

   /* An SF program (and hence a non-zero SF entry size) only exists on the
    * pre-Gfx7 path below; the Gfx7+ path asserts it is zero.
    */
   ASSERTED struct elk_sf_prog_data *sf_prog_data = params->sf_prog_data;
   ASSERTED const unsigned sf_entry_size =
      sf_prog_data ? sf_prog_data->urb_entry_size : 0;

#if GFX_VER >= 7
   assert(sf_entry_size == 0);

   /* VS entries use the size computed above; HS/DS/GS get the minimum
    * entry size of 1 (the actual start/entry counts come from
    * intel_get_urb_config below).
    */
   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 1, 1, 1 },
   };

   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->elk->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, &urb_cfg,
                        deref_block_size, &constrained);

   /* Tell drivers about the config. */
   blorp_pre_emit_urb_config(batch, &urb_cfg);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
    *    needs to be sent before any combination of VS associated 3DSTATE."
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.DepthStallEnable  = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address           = blorp_get_workaround_address(batch);
   }
#endif

   /* Emit 3DSTATE_URB_{VS,HS,DS,GS} by bumping the VS template's
    * sub-opcode for each successive stage.
    */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg.start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries      = urb_cfg.entries[i];
      }
   }

#else /* GFX_VER < 7 */
   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 0, 0, 0, sf_entry_size, },
   };
   blorp_emit_urb_config(batch, &urb_cfg);
#endif
}
315 
#if GFX_VER >= 7
/* Forward declaration (defined later in this file); used below to copy the
 * indirect clear color into the vertex buffer on the GPU.
 */
static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size);
#endif
323 
324 static void
blorp_emit_vertex_data(struct blorp_batch * batch,const struct blorp_params * params,struct blorp_address * addr,uint32_t * size)325 blorp_emit_vertex_data(struct blorp_batch *batch,
326                        const struct blorp_params *params,
327                        struct blorp_address *addr,
328                        uint32_t *size)
329 {
330    const float vertices[] = {
331       /* v0 */ (float)params->x1, (float)params->y1, params->z,
332       /* v1 */ (float)params->x0, (float)params->y1, params->z,
333       /* v2 */ (float)params->x0, (float)params->y0, params->z,
334    };
335 
336    void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
337    memcpy(data, vertices, sizeof(vertices));
338    *size = sizeof(vertices);
339    blorp_flush_range(batch, data, *size);
340 }
341 
/* Upload the second vertex buffer: 16 bytes of VS inputs (VUE header
 * constants) followed by one vec4 per varying actually consumed by the WM
 * program.  When the clear color is indirect, a GPU memcpy is emitted to
 * overwrite the first vec4 after the header with the real value.
 */
static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   struct elk_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;

   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   if (data == NULL)
      return;
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used by
       * the program and when necessary copy the values from the input storage
       * to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         const int input_index = wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
#if GFX_VER >= 7
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE.  We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in the
       * loop above.  Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      const unsigned clear_color_size =
         batch->blorp->isl_dev->ss.clear_value_size;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
#else
      unreachable("MCS partial resolve is not a thing on SNB and earlier");
#endif
   }
}
412 
/* Fill out vb[idx] for a buffer of `size` bytes at `addr` with the given
 * pitch.  A stride of 0 marks the buffer as instance data on the pre-Gfx8
 * paths (BufferAccessType below).
 */
static void
blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
                               unsigned idx,
                               struct blorp_address addr, uint32_t size,
                               uint32_t stride)
{
   vb[idx].VertexBufferIndex = idx;
   vb[idx].BufferStartingAddress = addr;
   vb[idx].BufferPitch = stride;

#if GFX_VER >= 6
   vb[idx].MOCS = addr.mocs;
#endif

#if GFX_VER >= 7
   vb[idx].AddressModifyEnable = true;
#endif

#if GFX_VER >= 8
   /* Gfx8+ takes an explicit buffer size... */
   vb[idx].BufferSize = size;
#elif GFX_VER >= 5
   /* ...Gfx5-7 take an inclusive end address instead. */
   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
   vb[idx].EndAddress = vb[idx].BufferStartingAddress;
   vb[idx].EndAddress.offset += size - 1;
#elif GFX_VER == 4
   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
   vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
#endif
}
442 
443 static void
blorp_emit_vertex_buffers(struct blorp_batch * batch,const struct blorp_params * params)444 blorp_emit_vertex_buffers(struct blorp_batch *batch,
445                           const struct blorp_params *params)
446 {
447    struct GENX(VERTEX_BUFFER_STATE) vb[2] = {};
448    const uint32_t num_vbs = ARRAY_SIZE(vb);
449 
450    struct blorp_address addrs[2] = {};
451    uint32_t sizes[2] = {};
452    blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
453    if (sizes[0] == 0)
454       return;
455    blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
456                                   3 * sizeof(float));
457 
458    blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
459    blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);
460 
461    blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
462 
463    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
464    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
465    if (!dw)
466       return;
467 
468    for (unsigned i = 0; i < num_vbs; i++) {
469       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
470       dw += GENX(VERTEX_BUFFER_STATE_length);
471    }
472 }
473 
static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   struct elk_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   /* Gfx4-5 need an extra NDC position element (see the #if block below). */
   bool need_ndc = batch->blorp->compiler->elk->devinfo->ver <= 5;
   /* VUE header + optional NDC + position + one element per flat varying */
   const unsigned num_elements = 2 + need_ndc + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Setup VBO for the rectangle primitive..
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. Below vertex fetcher gets programmed
    *        to assign this with primitive instance identifier which will be
    *        used for layered clears. All other renders have only one instance
    *        and therefore the value will be effectively zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable, Z is fixed to
    * zero and W to one. Header words dw0,2,3 are zero. There is no need to
    * include the fixed values in the vertex buffer. Vertex fetcher can be
    * instructed to fill vertex elements with constant values of one and zero
    * instead of reading them from the buffer.
    * Flat inputs are program constants that are not interpolated. Moreover
    * their values will be the same between vertices.
    *
    * See the vertex element setup below.
    */
   unsigned slot = 0;

   /* VUE header: dw0/dw2/dw3 stored as zero; dw1 gets the instance ID on
    * Gfx5-7 (on Gfx8+ it is injected via 3DSTATE_VF_SGVS instead).
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards hardware is no more instructed to overwrite
       * components using an element specifier. Instead one has separate
       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
       */
#if GFX_VER >= 8
      .Component1Control = VFCOMP_STORE_0,
#elif GFX_VER >= 5
      .Component1Control = VFCOMP_STORE_IID,
#else
      .Component1Control = VFCOMP_STORE_0,
#endif
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
#if GFX_VER <= 5
      .DestinationElementOffset = slot * 4,
#endif
   };
   slot++;

#if GFX_VER <= 5
   /* On Iron Lake and earlier, a native device coordinates version of the
    * position goes right after the normal VUE header and before position.
    * Since w == 1 for all of our coordinates, this is just a copy of the
    * position.
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
      .DestinationElementOffset = slot * 4,
   };
   slot++;
#endif

   /* Vertex position: X/Y/Z read from buffer 0, W forced to 1.0. */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
#if GFX_VER <= 5
      .DestinationElementOffset = slot * 4,
#endif
   };
   slot++;

   /* Flat inputs: one vec4 per varying, read from buffer 1 after its
    * 16-byte header.
    */
   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
#if GFX_VER <= 5
         .DestinationElementOffset = slot * 4,
#endif
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   /* Don't count blorp's internal draw in the VF statistics. */
   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

#if GFX_VER >= 8
   /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
    * primitive instance identifier. This is used for layered clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
#endif
}
650 
/* 3DSTATE_VIEWPORT_STATE_POINTERS */
static uint32_t
blorp_emit_cc_viewport(struct blorp_batch *batch)
{
   /* Emit a CC_VIEWPORT with either the standard [0, 1] depth range or an
    * unrestricted [-FLT_MAX, FLT_MAX] one, point the hardware at it, and
    * return its dynamic-state offset.
    */
   uint32_t cc_vp_offset;
   blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
      vp.MinimumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           -FLT_MAX : 0.0;
      vp.MaximumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           FLT_MAX : 1.0;
   }

#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
      vsp.CCViewportPointer = cc_vp_offset;
   }
#elif GFX_VER == 6
   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
      vsp.CCViewportStateChange = true;
      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
   }
#endif

   return cc_vp_offset;
}
676 
/* Emit blorp's SAMPLER_STATE: bilinear min/mag filtering, no mipmapping
 * (MinLOD == MaxLOD == 0), all address modes clamped.  Returns the
 * dynamic-state offset of the sampler.
 */
static uint32_t
blorp_emit_sampler_state(struct blorp_batch *batch)
{
   uint32_t offset;
   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
#if GFX_VER > 6
      sampler.NonnormalizedCoordinateEnable = true;
#endif
   }

   return offset;
}
704 
/* Emit the blorp sampler and bind it to the pixel shader stage via the
 * appropriate per-gen sampler-state-pointers packet.  Returns the sampler's
 * dynamic-state offset.
 */
UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch *batch)
{
   uint32_t offset = blorp_emit_sampler_state(batch);

#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
      ssp.PointertoPSSamplerState = offset;
   }
#elif GFX_VER == 6
   /* Gfx6 has a single packet covering all stages; flag every stage as
    * changed but only the PS pointer is programmed.
    */
   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
      ssp.VSSamplerStateChange = true;
      ssp.GSSamplerStateChange = true;
      ssp.PSSamplerStateChange = true;
      ssp.PointertoPSSamplerState = offset;
   }
#endif

   return offset;
}
725 
726 /* What follows is the code for setting up a "pipeline" on Sandy Bridge and
727  * later hardware.  This file will be included by i965 for gfx4-5 as well, so
728  * this code is guarded by GFX_VER >= 6.
729  */
730 #if GFX_VER >= 6
731 
/* Emit 3DSTATE_VS.  Without a VS program the packet is emitted with all
 * fields zeroed (VS disabled); otherwise the kernel pointer, GRF/URB read
 * layout, and thread count are programmed.
 */
static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   struct elk_vs_prog_data *vs_prog_data = params->vs_prog_data;

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         /* Field holds (threads - 1). */
         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

         /* Gfx8 requires SIMD8 VS dispatch here. */
         assert(GFX_VER < 8 ||
                vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
         /* NOTE(review): the `GFX_VER < 20` half of this condition is
          * vestigial in ELK — this header #errors out for GFX_VER > 8
          * (see top of file).
          */
#if GFX_VER >= 8 && GFX_VER < 20
         vs.SIMD8DispatchEnable = true;
#endif
      }
   }
}
761 
static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   const struct elk_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
    *     use of screen- space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *     SOLID: Any triangle or rectangle object found to be front-facing
    *     is rendered as a solid object. This setting is required when
    *     (rendering rectangle (RECTLIST) objects.
    */

#if GFX_VER >= 8

   /* Gfx8: rasterization state moved out of SF into 3DSTATE_RASTER, and
    * attribute setup into 3DSTATE_SBE; SF itself needs no non-default
    * fields for BLORP.
    */
   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      /* Skip the VUE header (one URB entry row) before the varyings. */
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = elk_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
   }

#elif GFX_VER >= 7

   /* Gfx7/7.5: SF still owns fill mode and MSAA rasterization mode;
    * attribute setup lives in 3DSTATE_SBE.
    */
   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
      sf.FrontFaceFillMode = FILL_MODE_SOLID;
      sf.BackFaceFillMode = FILL_MODE_SOLID;

      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
         MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;

#if GFX_VER == 7
      sf.DepthBufferSurfaceFormat = params->depth_format;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = elk_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
   }

#else /* GFX_VER <= 6 */

   /* Gfx6: a single 3DSTATE_SF packet carries both rasterization state and
    * attribute setup.
    */
   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
      sf.FrontFaceFillMode = FILL_MODE_SOLID;
      sf.BackFaceFillMode = FILL_MODE_SOLID;

      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
         MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;

      sf.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sf.VertexURBEntryReadLength = elk_blorp_get_urb_length(prog_data);
         sf.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sf.NumberofSFOutputAttributes = 0;
         sf.VertexURBEntryReadLength = 1;
      }
   }

#endif /* GFX_VER */
}
859 
static void
blorp_emit_ps_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   const struct elk_wm_prog_data *prog_data = params->wm_prog_data;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
    * nonzero to prevent the GPU from hanging.  While the documentation doesn't
    * mention this explicitly, it notes that the valid range for the field is
    * [1,39] = [2,40] threads, which excludes zero.
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */

#if GFX_VER >= 8
   const struct intel_device_info *devinfo = batch->blorp->compiler->elk->devinfo;

   /* Gfx8: WM needs no BLORP-specific fields; PS and PS_EXTRA carry the
    * real configuration.
    */
   blorp_emit(batch, GENX(3DSTATE_PS), ps);

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      if (params->src.enabled) {
         ps.SamplerCount = 1; /* Up to 4 samplers */
         ps.BindingTableEntryCount = 2;
      } else {
         ps.BindingTableEntryCount = 1;
      }

      /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166
       *
       * NOTE(review): dead code in ELK builds — this header #errors out for
       * GFX_VER > 8; kept in sync with the shared BLORP implementation.
       */
      if (GFX_VER == 11)
         ps.SamplerCount = 0;

      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
       * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
       * k, it implies 2(k+1) threads. It implicitly scales for different GT
       * levels (which have some # of PSDs).
       *
       * In Gfx8 the format is U8-2 whereas in Gfx9+ it is U9-1.
       */
      ps.MaximumNumberofThreadsPerPSD =
         devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);

      /* Route the requested CCS fast-clear/resolve operation to the
       * corresponding PS enable bit.
       */
      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         ps.RenderTargetResolveEnable = true;
         break;
      case ISL_AUX_OP_FAST_CLEAR:
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }

      /* The RENDER_SURFACE_STATE page for TGL says:
       *
       *   For an 8 bpp surface with NUM_MULTISAMPLES = 1, Surface Width not
       *   multiple of 64 pixels and more than 1 mip level in the view, Fast
       *   Clear is not supported when AUX_CCS_E is set in this field.
       *
       * The granularity of a fast-clear or ambiguate operation is likely one
       * CCS element. For an 8 bpp primary surface, this maps to 32px x 4rows.
       * Due to the surface layout parameters, if LOD0's width isn't a
       * multiple of 64px, LOD1 and LOD2+ will share CCS elements. Assert that
       * these operations aren't occurring on these LODs.
       *
       * We don't explicitly check for TGL+ because the restriction is
       * technically applicable to all hardware. Platforms prior to TGL don't
       * support CCS on 8 bpp surfaces. So, these unaligned fast clear
       * operations shouldn't be occurring prior to TGL as well.
       */
      if (isl_format_get_layout(params->dst.surf.format)->bpb == 8 &&
          params->dst.surf.logical_level0_px.width % 64 != 0 &&
          params->dst.surf.levels >= 3 &&
          params->dst.view.base_level >= 1) {
         assert(params->num_samples == 1);
         assert(!ps.RenderTargetFastClearEnable);
      }

      if (prog_data) {
         /* Picks SIMD8/16/32 dispatch enables from the compiled program. */
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);

         ps.KernelStartPointer0 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 1);
         ps.KernelStartPointer2 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 2);
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      if (prog_data) {
         psx.PixelShaderValid = true;
         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
         psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
      }

      if (params->src.enabled)
         psx.PixelShaderKillsPixel = true;
   }

#elif GFX_VER >= 7
   const struct intel_device_info *devinfo = batch->blorp->compiler->elk->devinfo;

   /* Gfx7/7.5: WM selects the HiZ depth operation and MSAA modes; PS holds
    * the kernel pointers and dispatch enables.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         wm.DepthBufferClear = true;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         wm.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         wm.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_NONE:
         break;
      default:
         unreachable("not reached");
      }

      if (prog_data) {
         wm.ThreadDispatchEnable = true;
         wm.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
      }

      if (params->src.enabled)
         wm.PixelShaderKillsPixel = true;

      if (params->num_samples > 1) {
         wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
         wm.MultisampleDispatchMode =
            (prog_data && prog_data->persample_dispatch) ?
            MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
      } else {
         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      /* Field holds (threads - 1); must be nonzero even with no program
       * (see the comment at the top of this function).
       */
      ps.MaximumNumberofThreads =
         batch->blorp->isl_dev->info->max_wm_threads - 1;

#if GFX_VERx10 == 75
      ps.SampleMask = 1;
#endif

      if (prog_data) {
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);

         ps.KernelStartPointer0 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 1);
         ps.KernelStartPointer2 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, ps, 2);

         ps.AttributeEnable = prog_data->num_varying_inputs > 0;
      } else {
         /* Gfx7 hardware gets angry if we don't enable at least one dispatch
          * mode, so just enable 16-pixel dispatch if we don't have a program.
          */
         ps._16PixelDispatchEnable = true;
      }

      if (params->src.enabled)
         ps.SamplerCount = 1; /* Up to 4 samplers */

      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         ps.RenderTargetResolveEnable = true;
         break;
      case ISL_AUX_OP_FAST_CLEAR:
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }
   }

#else /* GFX_VER <= 6 */

   /* Gfx6: everything (HiZ ops, dispatch enables, kernels, MSAA modes)
    * lives in the single 3DSTATE_WM packet.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm) {
      wm.MaximumNumberofThreads =
         batch->blorp->isl_dev->info->max_wm_threads - 1;

      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         wm.DepthBufferClear = true;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         wm.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         wm.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_NONE:
         break;
      default:
         unreachable("not reached");
      }

      if (prog_data) {
         wm.ThreadDispatchEnable = true;

         wm._8PixelDispatchEnable = prog_data->dispatch_8;
         wm._16PixelDispatchEnable = prog_data->dispatch_16;
         wm._32PixelDispatchEnable = prog_data->dispatch_32;

         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
         wm.DispatchGRFStartRegisterForConstantSetupData1 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
         wm.DispatchGRFStartRegisterForConstantSetupData2 =
            elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);

         wm.KernelStartPointer0 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, wm, 0);
         wm.KernelStartPointer1 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, wm, 1);
         wm.KernelStartPointer2 = params->wm_prog_kernel +
                                  elk_wm_prog_data_prog_offset(prog_data, wm, 2);

         wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
      }

      if (params->src.enabled) {
         wm.SamplerCount = 1; /* Up to 4 samplers */
         wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
      }

      if (params->num_samples > 1) {
         wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
         wm.MultisampleDispatchMode =
            (prog_data && prog_data->persample_dispatch) ?
            MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
      } else {
         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }
   }

#endif /* GFX_VER */
}
1128 
1129 static uint32_t
blorp_emit_blend_state(struct blorp_batch * batch,const struct blorp_params * params)1130 blorp_emit_blend_state(struct blorp_batch *batch,
1131                        const struct blorp_params *params)
1132 {
1133    struct GENX(BLEND_STATE) blend = { };
1134 
1135    uint32_t offset;
1136    int size = GENX(BLEND_STATE_length) * 4;
1137    size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
1138    uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
1139    if (state == NULL)
1140       return 0;
1141    uint32_t *pos = state;
1142 
1143    GENX(BLEND_STATE_pack)(NULL, pos, &blend);
1144    pos += GENX(BLEND_STATE_length);
1145 
1146    for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
1147       struct GENX(BLEND_STATE_ENTRY) entry = {
1148          .PreBlendColorClampEnable = true,
1149          .PostBlendColorClampEnable = true,
1150          .ColorClampRange = COLORCLAMP_RTFORMAT,
1151 
1152          .WriteDisableRed = params->color_write_disable & 1,
1153          .WriteDisableGreen = params->color_write_disable & 2,
1154          .WriteDisableBlue = params->color_write_disable & 4,
1155          .WriteDisableAlpha = params->color_write_disable & 8,
1156       };
1157       GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
1158       pos += GENX(BLEND_STATE_ENTRY_length);
1159    }
1160 
1161    blorp_flush_range(batch, state, size);
1162 
1163 #if GFX_VER >= 7
1164    blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
1165       sp.BlendStatePointer = offset;
1166 #if GFX_VER >= 8
1167       sp.BlendStatePointerValid = true;
1168 #endif
1169    }
1170 #endif
1171 
1172 #if GFX_VER >= 8
1173    blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
1174       ps_blend.HasWriteableRT = true;
1175    }
1176 #endif
1177 
1178    return offset;
1179 }
1180 
static uint32_t
blorp_emit_color_calc_state(struct blorp_batch *batch,
                            UNUSED const struct blorp_params *params)
{
   /* Emit COLOR_CALC_STATE into dynamic state and return its offset. */
   uint32_t offset;
   blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
#if GFX_VER <= 8
      /* Through Gfx8 the stencil reference value lives here. */
      cc.StencilReferenceValue = params->stencil_ref;
#endif
   }

#if GFX_VER >= 7
   /* Gfx7+ point the pipeline at the new state with a dedicated packet;
    * on Gfx6 all dynamic-state pointers are set by the combined
    * 3DSTATE_CC_STATE_POINTERS emitted in blorp_emit_pipeline().
    */
   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
      sp.ColorCalcStatePointer = offset;
#if GFX_VER >= 8
      sp.ColorCalcStatePointerValid = true;
#endif
   }
#endif

   return offset;
}
1203 
1204 static uint32_t
blorp_emit_depth_stencil_state(struct blorp_batch * batch,const struct blorp_params * params)1205 blorp_emit_depth_stencil_state(struct blorp_batch *batch,
1206                                const struct blorp_params *params)
1207 {
1208 #if GFX_VER >= 8
1209    struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
1210       GENX(3DSTATE_WM_DEPTH_STENCIL_header),
1211    };
1212 #else
1213    struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
1214 #endif
1215 
1216    if (params->depth.enabled) {
1217       ds.DepthBufferWriteEnable = true;
1218 
1219       switch (params->hiz_op) {
1220       /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
1221        *   - 7.5.3.1 Depth Buffer Clear
1222        *   - 7.5.3.2 Depth Buffer Resolve
1223        *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
1224        */
1225       case ISL_AUX_OP_FULL_RESOLVE:
1226          ds.DepthTestEnable = true;
1227          ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
1228          break;
1229 
1230       case ISL_AUX_OP_NONE:
1231       case ISL_AUX_OP_FAST_CLEAR:
1232       case ISL_AUX_OP_AMBIGUATE:
1233          ds.DepthTestEnable = false;
1234          break;
1235       case ISL_AUX_OP_PARTIAL_RESOLVE:
1236          unreachable("Invalid HIZ op");
1237       }
1238    }
1239 
1240    if (params->stencil.enabled) {
1241       ds.StencilBufferWriteEnable = true;
1242       ds.StencilTestEnable = true;
1243       ds.DoubleSidedStencilEnable = false;
1244 
1245       ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
1246       ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
1247 
1248       ds.StencilWriteMask = params->stencil_mask;
1249    }
1250 
1251 #if GFX_VER >= 8
1252    uint32_t offset = 0;
1253    uint32_t *dw = blorp_emit_dwords(batch,
1254                                     GENX(3DSTATE_WM_DEPTH_STENCIL_length));
1255    if (!dw)
1256       return 0;
1257 
1258    GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
1259 #else
1260    uint32_t offset;
1261    void *state = blorp_alloc_dynamic_state(batch,
1262                                            GENX(DEPTH_STENCIL_STATE_length) * 4,
1263                                            64, &offset);
1264    GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
1265    blorp_flush_range(batch, state, GENX(DEPTH_STENCIL_STATE_length) * 4);
1266 #endif
1267 
1268 #if GFX_VER == 7
1269    blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
1270       sp.PointertoDEPTH_STENCIL_STATE = offset;
1271    }
1272 #endif
1273 
1274    return offset;
1275 }
1276 
1277 static void
blorp_emit_3dstate_multisample(struct blorp_batch * batch,const struct blorp_params * params)1278 blorp_emit_3dstate_multisample(struct blorp_batch *batch,
1279                                const struct blorp_params *params)
1280 {
1281    blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
1282       ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
1283       ms.PixelLocation              = CENTER;
1284 #if GFX_VER >= 7 && GFX_VER < 8
1285       switch (params->num_samples) {
1286       case 1:
1287          INTEL_SAMPLE_POS_1X(ms.Sample);
1288          break;
1289       case 2:
1290          INTEL_SAMPLE_POS_2X(ms.Sample);
1291          break;
1292       case 4:
1293          INTEL_SAMPLE_POS_4X(ms.Sample);
1294          break;
1295       case 8:
1296          INTEL_SAMPLE_POS_8X(ms.Sample);
1297          break;
1298       default:
1299          break;
1300       }
1301 #elif GFX_VER < 7
1302       INTEL_SAMPLE_POS_4X(ms.Sample);
1303 #endif
1304    }
1305 }
1306 
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   /* Emit the full 3D pipeline state for a BLORP operation, in the order
    * the hardware expects: URB config, dynamic state (blend/CC/DS), push
    * constants, sampler/multisample state, then the per-stage packets.
    */
   uint32_t blend_state_offset = 0;
   uint32_t color_calc_state_offset;
   uint32_t depth_stencil_state_offset;

   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   /* BLEND_STATE is only needed when a fragment program is bound. */
   if (params->wm_prog_data) {
      blend_state_offset = blorp_emit_blend_state(batch, params);
   }
   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);

#if GFX_VER == 6
   /* 3DSTATE_CC_STATE_POINTERS
    *
    * The pointer offsets are relative to
    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
    *
    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
    *
    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
    * gfx7+.  However, on gfx6 and earlier, they're all lumpped together in
    * one CC_STATE_POINTERS packet so we have to emit that here.
    */
   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
      cc.BLEND_STATEChange = params->wm_prog_data ? true : false;
      cc.ColorCalcStatePointerValid = true;
      cc.DEPTH_STENCIL_STATEChange = true;
      cc.PointertoBLEND_STATE = blend_state_offset;
      cc.ColorCalcStatePointer = color_calc_state_offset;
      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
   }
#else
   (void)blend_state_offset;
   (void)color_calc_state_offset;
   (void)depth_stencil_state_offset;
#endif

   UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);

#if GFX_VER == 7
#define CONSTANT_MOCS xs.ConstantBody.MOCS = mocs
#else
#define CONSTANT_MOCS
#endif
   /* Emit empty push-constant packets for every stage (BLORP doesn't use
    * push constants); on Gfx7 the constant body also needs a MOCS.
    */
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { CONSTANT_MOCS; }
#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { CONSTANT_MOCS; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { CONSTANT_MOCS; }
#endif
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { CONSTANT_MOCS; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { CONSTANT_MOCS; }
#undef CONSTANT_MOCS

   if (params->src.enabled)
      blorp_emit_sampler_state_ps(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a
    *   3DSTATE_VS command that causes the VS Function Enable to
    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *   command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
#if GFX_VER >= 7
   /* Tessellation, streamout, and GS are left disabled (default packets). */
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
#endif
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);
}
1403 
1404 /******** This is the end of the pipeline setup code ********/
1405 
1406 #endif /* GFX_VER >= 6 */
1407 
#if GFX_VER >= 7
static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size)
{
   /* Copy `size` bytes (must be dword-aligned) from src to dst using the
    * command streamer, one dword per iteration.
    */
   assert(size % 4 == 0);

   uint32_t remaining = size;
   while (remaining > 0) {
#if GFX_VER >= 8
      /* Gfx8+ has a real memory-to-memory copy command. */
      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
         cp.DestinationMemoryAddress = dst;
         cp.SourceMemoryAddress = src;
      }
#else
      /* IVB does not have a general purpose register for command streamer
       * commands, so bounce each dword through a scratch register instead.
       */
#define BLORP_TEMP_REG 0x2440 /* GFX7_3DPRIM_BASE_VERTEX */
      blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), load) {
         load.RegisterAddress = BLORP_TEMP_REG;
         load.MemoryAddress = src;
      }
      blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), store) {
         store.RegisterAddress = BLORP_TEMP_REG;
         store.MemoryAddress = dst;
      }
#undef BLORP_TEMP_REG
#endif
      dst.offset += 4;
      src.offset += 4;
      remaining -= 4;
   }
}
#endif
1443 
static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         uint8_t color_write_disable,
                         bool is_render_target)
{
   /* Fill `state` (at binding-table offset `state_offset`) with a
    * RENDER_SURFACE_STATE for `surface`, emit the address relocations, and
    * copy the indirect clear color into the surface state if needed.
    * aux_op is only consulted on gfx7+ (hence UNUSED elsewhere).
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   /* 1-D surfaces stored with the 2-D layout are described to the hardware
    * as single-row 2-D surfaces.
    */
   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer (not reachable in ELK builds,
    * but the NULL check is kept for parity with the shared BLORP code).
    */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   /* Pre-gfx6 hardware applies the per-channel write mask through the
    * surface state rather than BLEND_STATE.
    */
   isl_channel_mask_t write_disable_mask = 0;
   if (is_render_target && GFX_VER <= 5) {
      if (color_write_disable & BITFIELD_BIT(0))
         write_disable_mask |= ISL_CHANNEL_RED_BIT;
      if (color_write_disable & BITFIELD_BIT(1))
         write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
      if (color_write_disable & BITFIELD_BIT(2))
         write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
      if (color_write_disable & BITFIELD_BIT(3))
         write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
   }

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .write_disables = write_disable_mask);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information.  This should be ok, however, because
       * surface buffer addresses are always 4K page aligned.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 7
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything.  We can avoid the MI memcpy on that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#else
      unreachable("Fast clears are only supported on gfx7+");
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1529 
static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct blorp_surface_info *surface,
                              uint32_t *state)
{
   /* Pack a SURFTYPE_NULL RENDER_SURFACE_STATE into `state` that still
    * carries `surface`'s dimensions, so depth/stencil-only operations have
    * a correctly-sized (but write-discarding) render target bound.
    */
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
#if GFX_VER >= 6
      /* ffs(n) - 1 == log2(n) for power-of-two sample counts. */
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
      .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),
#endif

#if GFX_VER >= 7
      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
#endif

#if GFX_VER >= 8
      .TileMode = YMAJOR,
#else
      .TiledSurface = true,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1564 
static uint32_t
blorp_setup_binding_table(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   /* Build (or reuse) the binding table for this operation and return its
    * offset.  At most two entries: the render target (or a null surface for
    * depth/stencil-only ops) and, optionally, the source texture.  Returns
    * 0 if binding-table allocation fails.
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t surface_offsets[2], bind_offset = 0;
   void *surface_maps[2];

   if (params->use_pre_baked_binding_table) {
      /* The driver supplied a ready-made table; just point at it. */
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      if (!blorp_alloc_binding_table(batch, num_surfaces,
                                     isl_dev->ss.size, isl_dev->ss.align,
                                     &bind_offset, surface_offsets, surface_maps))
         return 0;

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
      } else {
         /* No color destination: bind a null surface sized like the
          * depth/stencil surface so the render target slot is valid.
          */
         assert(params->depth.enabled || params->stencil.enabled);
         const struct blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  0, false);
      }
   }

   return bind_offset;
}
1607 
static void
blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
{
   /* Point the pixel shader stage at the binding table built by
    * blorp_setup_binding_table().  Blorp binds surfaces only to the PS, so
    * on Gfx7+ (one packet per stage) the VS/HS/DS/GS pointers are emitted
    * with zero offsets.
    */
#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
#elif GFX_VER >= 6
   /* Gfx6 uses a single packet with per-stage change bits. */
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
      bt.PSBindingTableChange = true;
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
#else
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
#endif
}
1634 
static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   /* Emit the full depth/stencil/HiZ buffer packet group for this operation.
    * The dwords are reserved up front and the surface addresses are patched
    * in via relocations before isl packs the final state on top.  If neither
    * depth nor stencil is enabled, this emits a "null" depth/stencil config.
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;

   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   /* Take the view/MOCS from depth if present, else stencil, else default. */
   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   } else {
      info.mocs = isl_mocs(isl_dev, 0, false);
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      /* Record the reloc'd depth buffer address at its packet offset. */
      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;
#if GFX_VER == 6
         /* Sandy bridge hardware does not technically support mipmapped HiZ.
          * However, we have a special layout that allows us to make it work
          * anyway by manually offsetting to the specified miplevel.
          */
         assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
         uint64_t offset_B;
         isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
                                             info.view->base_level, 0, 0,
                                             &offset_B, NULL, NULL);
         hiz_address.offset += offset_B;
#endif

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;
#if GFX_VER == 6
      /* Sandy bridge hardware does not technically support mipmapped stencil.
       * However, we have a special layout that allows us to make it work
       * anyway by manually offsetting to the specified miplevel.
       */
      assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
      uint64_t offset_B;
      isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
                                          info.view->base_level, 0, 0,
                                          &offset_B, NULL, NULL);
      stencil_address.offset += offset_B;
#endif

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   /* Pack the 3DSTATE_DEPTH_BUFFER/HIER_DEPTH/STENCIL packets over `dw`. */
   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
}
1715 
#if GFX_VER >= 8
/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 */
static void
blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer.
    */
   assert(params->depth.enabled || params->stencil.enabled);

   blorp_measure_start(batch, params);

   /* The stencil buffer should only be enabled if a fast clear operation is
    * requested.
    */
   if (params->stencil.enabled)
      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
    * the Number of Multisamples. This packet must not be used to change
    * Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and always
    * emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* From the BDW PRM Volume 7, Depth Buffer Clear:
    *
    *    The clear value must be between the min and max depth values
    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
    *    D32_FLOAT, then +/-DENORM values are also allowed.
    *
    * Set the bounds to match our hardware limits, [0.0, 1.0].
    */
   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
      assert(params->depth.clear_color.f32[0] >= 0.0f);
      assert(params->depth.clear_color.f32[0] <= 1.0f);
      blorp_emit_cc_viewport(batch);
   }

   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
    * disabled for HiZ ops and it appears that force-enabling it can lead to
    * GPU hangs on at least Skylake.  Since we don't know the current state of
    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail. This is because the op requires that a
    * new config is emitted for each additional layer.
    */
   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
      assert(params->num_layers <= 1);
   } else {
      blorp_emit_depth_stencil_config(batch, params);
   }

   /* TODO - If we ever start using 3DSTATE_WM_HZ_OP::StencilBufferResolveEnable
    * we need to implement required steps, flushes documented in Wa_1605967699.
    */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hzp.StencilBufferClearEnable = params->stencil.enabled;
         hzp.DepthBufferClearEnable = params->depth.enabled;
         hzp.StencilClearValue = params->stencil_ref;
         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hzp.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hzp.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
      hzp.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hzp.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hzp.ClearRectangleXMin = params->x0;
      hzp.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hzp.ClearRectangleXMax = params->x1;
      hzp.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL with all bits clear except for "Post-Sync Operation",
    * which must be set to "Write Immediate Data".
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }

   /* Emit an all-zero 3DSTATE_WM_HZ_OP to end the HiZ operation. */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);

   blorp_measure_end(batch, params);
}
#endif
1833 
1834 static bool
blorp_uses_bti_rt_writes(const struct blorp_batch * batch,const struct blorp_params * params)1835 blorp_uses_bti_rt_writes(const struct blorp_batch *batch, const struct blorp_params *params)
1836 {
1837    if (batch->flags & (BLORP_BATCH_USE_BLITTER | BLORP_BATCH_USE_COMPUTE))
1838       return false;
1839 
1840    /* HIZ clears use WM_HZ ops rather than a clear shader using RT writes. */
1841    return params->hiz_op == ISL_AUX_OP_NONE;
1842 }
1843 
static void
blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
{
   /* Execute the operation on the 3D pipeline: set up vertex data and the
    * full render pipeline, bind surfaces and depth/stencil state, then draw
    * a RECTLIST primitive instanced once per destination layer.
    */
#if GFX_VER >= 8
   /* HiZ operations use the dedicated WM_HZ_OP path instead of a draw. */
   if (params->hiz_op != ISL_AUX_OP_NONE) {
      blorp_emit_gfx8_hiz_op(batch, params);
      return;
   }
#endif

   blorp_emit_vertex_buffers(batch, params);
   blorp_emit_vertex_elements(batch, params);

   blorp_emit_pipeline(batch, params);

   blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));

   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
      blorp_emit_depth_stencil_config(batch, params);

   /* NOTE(review): unused on these Gfx versions; presumably kept for parity
    * with the non-ELK variant of this file — confirm.
    */
   const UNUSED bool use_tbimr = false;
   blorp_emit_pre_draw(batch, params);
   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
#if GFX_VER >= 7
      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
#endif
      /* A rect is described by three vertices; blorp derives the fourth. */
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
   blorp_emit_post_draw(batch, params);
}
1877 
#if GFX_VER >= 7

static void
blorp_get_compute_push_const(struct blorp_batch *batch,
                             const struct blorp_params *params,
                             uint32_t threads,
                             uint32_t *state_offset,
                             unsigned *state_size)
{
   /* Build the CURBE push-constant buffer for the blorp compute shader and
    * return its dynamic-state offset and size (both 0 when the shader pushes
    * no constants).
    *
    * Layout: the cross-thread data once, followed by one per-thread block
    * per hardware thread; the final dword of each per-thread block is
    * overwritten with that thread's subgroup ID.
    */
   const struct elk_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const unsigned push_const_size =
      ALIGN(elk_cs_push_const_total_size(cs_prog_data, threads), 64);
   /* All pushed data must come from params->wm_inputs. */
   assert(cs_prog_data->push.cross_thread.size +
          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

   if (push_const_size == 0) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }

   uint32_t push_const_offset;
   uint32_t *push_const =
      blorp_alloc_dynamic_state(batch, push_const_size, 64,
                                &push_const_offset);
   /* Zero-fill so the alignment padding is deterministic. */
   memset(push_const, 0x0, push_const_size);

   void *dst = push_const;
   const void *src = (char *)&params->wm_inputs;

   if (cs_prog_data->push.cross_thread.size > 0) {
      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
      dst += cs_prog_data->push.cross_thread.size;
      src += cs_prog_data->push.cross_thread.size;
   }

   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < threads; t++) {
         /* Copy all but the last dword, which is the subgroup ID slot. */
         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);

         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
         *subgroup_id = t;

         dst += cs_prog_data->push.per_thread.size;
      }
   }

   *state_offset = push_const_offset;
   *state_size = push_const_size;
}

#endif /* GFX_VER >= 7 */
1930 
static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
   /* Execute the operation on the compute pipeline (Gfx7+ only): program
    * MEDIA_VFE_STATE, load push constants via MEDIA_CURBE_LOAD, set up the
    * interface descriptor and binding table, then dispatch a GPGPU_WALKER
    * covering the destination rectangle and layer range.
    */
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   blorp_measure_start(batch, params);

#if GFX_VER >= 7
   const struct intel_device_info *devinfo = batch->blorp->compiler->elk->devinfo;
   const struct elk_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct intel_cs_dispatch_info dispatch =
      elk_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   /* Convert the pixel rectangle [x0,x1)x[y0,y1) and layer range into
    * workgroup IDs; x1/y1 round up so partial edge groups are covered.
    */
   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
   uint32_t group_z0 = params->dst.z_offset;
   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
   assert(params->num_layers >= 1);
   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
   /* One layer per workgroup in Z. */
   assert(cs_prog_data->local_size[2] == 1);

#endif /* GFX_VER >= 7 */

#if GFX_VER >= 7

   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
    *
    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *  the only bits that are changed are scoreboard related: Scoreboard
    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
   }

   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
      assert(prog_data->total_scratch == 0);
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
#if GFX_VER == 7
      vfe.GPGPUMode = true;
#endif
      vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;

      /* CURBE space: per-thread regs for each thread plus the cross-thread
       * regs, rounded up to an even register count.
       */
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);

   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
      curbe.CURBETotalDataLength = push_const_size;
      curbe.CURBEDataStartAddress = push_const_offset;
   }

   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   /* A sampler is only needed when sampling from a source surface. */
   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = params->cs_prog_kernel,
      .SamplerStatePointer = samplers_offset,
      .SamplerCount = params->src.enabled ? 1 : 0,
      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
#if GFX_VER >= 8 || GFX_VERx10 == 75
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
   };

   uint32_t idd_offset;
   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);

   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
      mid.InterfaceDescriptorTotalLength        = size;
      mid.InterfaceDescriptorDataStartAddress   = idd_offset;
   }

   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDStartingX       = group_x0;
      ggw.ThreadGroupIDStartingY       = group_y0;
#if GFX_VER >= 8
      ggw.ThreadGroupIDStartingResumeZ = group_z0;
#else
      ggw.ThreadGroupIDStartingZ       = group_z0;
#endif
      ggw.ThreadGroupIDXDimension      = group_x1;
      ggw.ThreadGroupIDYDimension      = group_y1;
      ggw.ThreadGroupIDZDimension      = group_z1;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

#else /* GFX_VER >= 7 */

   unreachable("Compute blorp is not supported on SNB and earlier");

#endif /* GFX_VER >= 7 */

   blorp_measure_end(batch, params);
}
2062 
2063 /**
2064  * \brief Execute a blit or render pass operation.
2065  *
2066  * To execute the operation, this function manually constructs and emits a
2067  * batch to draw a rectangle primitive. The batchbuffer is flushed before
2068  * constructing and after emitting the batch.
2069  *
2070  * This function alters no GL state.
2071  */
2072 static void
blorp_exec(struct blorp_batch * batch,const struct blorp_params * params)2073 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
2074 {
2075    /* Not supported in Gfx versions that use Elk. */
2076    assert((batch->flags & BLORP_BATCH_USE_BLITTER) == 0);
2077 
2078    if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
2079       blorp_exec_compute(batch, params);
2080    } else {
2081       blorp_exec_3d(batch, params);
2082    }
2083 }
2084 
2085 #endif /* BLORP_GENX_EXEC_ELK_H */
2086