xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/iris/iris_state.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file iris_state.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * This is the main state upload code.
31  *
32  * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33  * complex, or highly reusable state can be created once, and bound and
34  * rebound multiple times.  This is modeled with the pipe->create_*_state()
35  * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36  * streamed out on the fly, via pipe->set_*_state() hooks.
37  *
38  * OpenGL involves frequently mutating context state, which is mirrored in
39  * core Mesa by highly mutable data structures.  However, most applications
40  * typically draw the same things over and over - from frame to frame, most
41  * of the same objects are still visible and need to be redrawn.  So, rather
42  * than inventing new state all the time, applications usually mutate to swap
43  * between known states that we've seen before.
44  *
45  * Gallium isolates us from this mutation by tracking API state, and
46  * distilling it into a set of Constant State Objects, or CSOs.  Large,
47  * complex, or typically reusable state can be created once, then reused
48  * multiple times.  Drivers can create and store their own associated data.
49  * This create/bind model corresponds to the pipe->create_*_state() and
50  * pipe->bind_*_state() driver hooks.
51  *
52  * Some state is cheap to create, or expected to be highly dynamic.  Rather
53  * than creating and caching piles of CSOs for these, Gallium simply streams
54  * them out, via the pipe->set_*_state() driver hooks.
55  *
56  * To reduce draw time overhead, we try to compute as much state at create
57  * time as possible.  Wherever possible, we translate the Gallium pipe state
58  * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59  * we can simply memcpy them into a batch buffer.
60  *
61  * No hardware matches the abstraction perfectly, so some commands require
62  * information from multiple CSOs.  In this case, we can store two copies
63  * of the packet (one in each CSO), and simply | together their DWords at
64  * draw time.  Sometimes the second set is trivial (one or two fields), so
65  * we simply pack it at draw time.
66  *
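 * As an illustrative sketch (the CSO fields here are placeholders, not the
 * exact ones this file uses), merging two partially-packed copies of a
 * packet at draw time looks roughly like:
 *
 *    uint32_t merged[GENX(3DSTATE_PS_BLEND_length)];
 *    for (int i = 0; i < GENX(3DSTATE_PS_BLEND_length); i++)
 *       merged[i] = cso_blend->ps_blend[i] | shader_ps_blend[i];
 *
 * and the merged DWords are what actually get copied into the batch buffer.
 *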
67  * There are two main components in the file below.  First, the CSO hooks
 68  * create/bind/track state.  Second are the draw-time upload functions,
69  * iris_upload_render_state() and iris_upload_compute_state(), which read
70  * the context state and emit the commands into the actual batch.
71  */
72 
73 #include <stdio.h>
74 #include <errno.h>
75 
76 #ifdef HAVE_VALGRIND
77 #include <valgrind.h>
78 #include <memcheck.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83 
84 #include "pipe/p_defines.h"
85 #include "pipe/p_state.h"
86 #include "pipe/p_context.h"
87 #include "pipe/p_screen.h"
88 #include "util/u_dual_blend.h"
89 #include "util/u_inlines.h"
90 #include "util/format/u_format.h"
91 #include "util/u_framebuffer.h"
92 #include "util/u_transfer.h"
93 #include "util/u_upload_mgr.h"
94 #include "util/u_viewport.h"
95 #include "util/u_memory.h"
96 #include "util/u_trace_gallium.h"
97 #include "nir.h"
98 #include "intel/common/intel_aux_map.h"
99 #include "intel/common/intel_compute_slm.h"
100 #include "intel/common/intel_l3_config.h"
101 #include "intel/common/intel_sample_positions.h"
102 #include "intel/ds/intel_tracepoints.h"
103 #include "iris_batch.h"
104 #include "iris_context.h"
105 #include "iris_defines.h"
106 #include "iris_pipe.h"
107 #include "iris_resource.h"
108 #include "iris_utrace.h"
109 
110 #include "iris_genx_macros.h"
111 
112 #if GFX_VER >= 9
113 #include "intel/compiler/brw_compiler.h"
114 #include "intel/common/intel_genX_state_brw.h"
115 #else
116 #include "intel/compiler/elk/elk_compiler.h"
117 #include "intel/common/intel_genX_state_elk.h"
118 #endif
119 
120 #include "intel/common/intel_guardband.h"
121 #include "intel/common/intel_pixel_hash.h"
122 #include "intel/common/intel_tiled_render.h"
123 
124 /**
125  * Statically assert that PIPE_* enums match the hardware packets.
126  * (As long as they match, we don't need to translate them.)
127  */
128 UNUSED static void pipe_asserts()
129 {
130 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
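   /* Each PIPE_ASSERT argument is an equality comparison between a Gallium
    * enum value and the corresponding genxml value, so the cast-to-int plus
    * STATIC_ASSERT verifies at compile time that the two match.
    */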
131 
132    /* pipe_logicop happens to match the hardware. */
133    PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
134    PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
135    PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
136    PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
137    PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
138    PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
139    PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
140    PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
141    PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
142    PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
143    PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
144    PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
145    PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
146    PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
147    PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
148    PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
149 
150    /* pipe_blendfactor happens to match the hardware. */
151    PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
152    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
153    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
154    PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
155    PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
156    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
157    PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
158    PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
159    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
160    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
161    PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
162    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
163    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
164    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
165    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
166    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
167    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
168    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
169    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
170 
171    /* pipe_blend_func happens to match the hardware. */
172    PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
173    PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
174    PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
175    PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
176    PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
177 
178    /* pipe_stencil_op happens to match the hardware. */
179    PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
180    PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
181    PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
182    PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
183    PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
184    PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
185    PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
186    PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
187 
188    /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
189    PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
190    PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
191 #undef PIPE_ASSERT
192 }
193 
194 static unsigned
195 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
196 {
197    static const unsigned map[] = {
198       [MESA_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
199       [MESA_PRIM_LINES]                    = _3DPRIM_LINELIST,
200       [MESA_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
201       [MESA_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
202       [MESA_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
203       [MESA_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
204       [MESA_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
205       [MESA_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
206       [MESA_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
207       [MESA_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
208       [MESA_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
209       [MESA_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
210       [MESA_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
211       [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
212       [MESA_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
213    };
214 
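   /* The _3DPRIM_PATCHLIST_n values are consecutive, so for patches we start
    * from (_3DPRIM_PATCHLIST_1 - 1) and add verts_per_patch below; e.g. a
    * 3-control-point patch yields _3DPRIM_PATCHLIST_3.
    */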
215    return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
216 }
217 
218 static unsigned
219 translate_compare_func(enum pipe_compare_func pipe_func)
220 {
221    static const unsigned map[] = {
222       [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
223       [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
224       [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
225       [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
226       [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
227       [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
228       [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
229       [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
230    };
231    return map[pipe_func];
232 }
233 
234 static unsigned
235 translate_shadow_func(enum pipe_compare_func pipe_func)
236 {
237    /* Gallium specifies the result of shadow comparisons as:
238     *
239     *    1 if ref <op> texel,
240     *    0 otherwise.
241     *
242     * The hardware does:
243     *
244     *    0 if texel <op> ref,
245     *    1 otherwise.
246     *
247     * So we need to flip the operator and also negate.
248     */
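   /* For example, PIPE_FUNC_LESS wants a result of 1 when (ref < texel).
    * The hardware produces !(texel OP ref), so we program LEQUAL:
    * !(texel <= ref) == (texel > ref) == (ref < texel).
    */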
249    static const unsigned map[] = {
250       [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
251       [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
252       [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
253       [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
254       [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
255       [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
256       [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
257       [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
258    };
259    return map[pipe_func];
260 }
261 
262 static unsigned
263 translate_cull_mode(unsigned pipe_face)
264 {
265    static const unsigned map[4] = {
266       [PIPE_FACE_NONE]           = CULLMODE_NONE,
267       [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
268       [PIPE_FACE_BACK]           = CULLMODE_BACK,
269       [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
270    };
271    return map[pipe_face];
272 }
273 
274 static unsigned
275 translate_fill_mode(unsigned pipe_polymode)
276 {
277    static const unsigned map[4] = {
278       [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
279       [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
280       [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
281       [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282    };
283    return map[pipe_polymode];
284 }
285 
286 static unsigned
287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289    static const unsigned map[] = {
290       [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291       [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
292       [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
293    };
294    return map[pipe_mip];
295 }
296 
297 static uint32_t
298 translate_wrap(unsigned pipe_wrap)
299 {
300    static const unsigned map[] = {
301       [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
302       [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
303       [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
304       [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
305       [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
306       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
307 
308       /* These are unsupported. */
309       [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
310       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
311    };
312    return map[pipe_wrap];
313 }
314 
315 /**
316  * Allocate space for some indirect state.
317  *
318  * Return a pointer to the map (to fill it out) and a state ref (for
319  * referring to the state in GPU commands).
320  */
321 static void *
322 upload_state(struct u_upload_mgr *uploader,
323              struct iris_state_ref *ref,
324              unsigned size,
325              unsigned alignment)
326 {
327    void *p = NULL;
328    u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
329    return p;
330 }
331 
332 /**
333  * Stream out temporary/short-lived state.
334  *
335  * This allocates space, pins the BO, and includes the BO address in the
336  * returned offset (which works because all state lives in 32-bit memory
337  * zones).
338  */
339 static uint32_t *
340 stream_state(struct iris_batch *batch,
341              struct u_upload_mgr *uploader,
342              struct pipe_resource **out_res,
343              unsigned size,
344              unsigned alignment,
345              uint32_t *out_offset)
346 {
347    void *ptr = NULL;
348 
349    u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
350 
351    struct iris_bo *bo = iris_resource_bo(*out_res);
352    iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
353 
354    iris_record_state_size(batch->state_sizes,
355                           bo->address + *out_offset, size);
356 
357    *out_offset += iris_bo_offset_from_base_address(bo);
358 
359    return ptr;
360 }
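
/* Typical draw-time usage (an illustrative sketch; the resource pointer and
 * the packet being pointed at are placeholders rather than a specific
 * caller in this file):
 *
 *    uint32_t offset;
 *    uint32_t *map = stream_state(batch, ice->state.dynamic_uploader,
 *                                 &some_res, 4 * GENX(SCISSOR_RECT_length),
 *                                 32, &offset);
 *
 * after which the state (e.g. SCISSOR_RECTs) is packed into 'map' and a
 * pointer packet such as 3DSTATE_SCISSOR_STATE_POINTERS is emitted with
 * 'offset'.
 */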
361 
362 /**
363  * stream_state() + memcpy.
364  */
365 static uint32_t
366 emit_state(struct iris_batch *batch,
367            struct u_upload_mgr *uploader,
368            struct pipe_resource **out_res,
369            const void *data,
370            unsigned size,
371            unsigned alignment)
372 {
373    unsigned offset = 0;
374    uint32_t *map =
375       stream_state(batch, uploader, out_res, size, alignment, &offset);
376 
377    if (map)
378       memcpy(map, data, size);
379 
380    return offset;
381 }
382 
383 /**
384  * Did field 'x' change between 'old_cso' and 'new_cso'?
385  *
386  * (If so, we may want to set some dirty flags.)
387  */
388 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
389 #define cso_changed_memcmp(x) \
390    (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
391 #define cso_changed_memcmp_elts(x, n) \
392    (!old_cso || memcmp(old_cso->x, new_cso->x, n * sizeof(old_cso->x[0])) != 0)
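
/* Example use (an illustrative sketch; the field and dirty bit stand in for
 * the real ones used by the bind_*_state hooks later in this file):
 *
 *    if (cso_changed(line_smooth))
 *       ice->state.dirty |= IRIS_DIRTY_RASTER;
 */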
393 
394 static void
395 flush_before_state_base_change(struct iris_batch *batch)
396 {
397    /* Wa_14014427904 - We need additional invalidate/flush when
398     * emitting NP state commands with ATS-M in compute mode.
399     */
400    bool atsm_compute = intel_device_info_is_atsm(batch->screen->devinfo) &&
401                        batch->name == IRIS_BATCH_COMPUTE;
402    uint32_t np_state_wa_bits =
403       PIPE_CONTROL_CS_STALL |
404       PIPE_CONTROL_STATE_CACHE_INVALIDATE |
405       PIPE_CONTROL_CONST_CACHE_INVALIDATE |
406       PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
407       PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
408       PIPE_CONTROL_INSTRUCTION_INVALIDATE |
409       PIPE_CONTROL_FLUSH_HDC;
410 
411    /* Flush before emitting STATE_BASE_ADDRESS.
412     *
413     * This isn't documented anywhere in the PRM.  However, it seems to be
414     * necessary prior to changing the surface state base address.  We've
415     * seen issues in Vulkan where we get GPU hangs when using multi-level
416     * command buffers which clear depth, reset state base address, and then
417     * go render stuff.
418     *
419     * Normally, in GL, we would trust the kernel to do sufficient stalls
420     * and flushes prior to executing our batch.  However, it doesn't seem
421     * as if the kernel's flushing is always sufficient and we don't want to
422     * rely on it.
423     *
424     * We make this an end-of-pipe sync instead of a normal flush because we
425     * do not know the current status of the GPU.  On Haswell at least,
426     * having a fast-clear operation in flight at the same time as a normal
427     * rendering operation can cause hangs.  Since the kernel's flushing is
428     * insufficient, we need to ensure that any rendering operations from
429     * other processes are definitely complete before we try to do our own
430     * rendering.  It's a bit of a big hammer but it appears to work.
431     */
432    iris_emit_end_of_pipe_sync(batch,
433                               "change STATE_BASE_ADDRESS (flushes)",
434                               (atsm_compute ? np_state_wa_bits : 0) |
435                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
436                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
437                               PIPE_CONTROL_DATA_CACHE_FLUSH);
438 }
439 
440 static void
441 flush_after_state_base_change(struct iris_batch *batch)
442 {
443    const struct intel_device_info *devinfo = batch->screen->devinfo;
444    /* After re-setting the surface state base address, we have to do some
445     * cache flushing so that the sampler engine will pick up the new
446     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
447     * Shared Function > 3D Sampler > State > State Caching (page 96):
448     *
449     *    Coherency with system memory in the state cache, like the texture
450     *    cache is handled partially by software. It is expected that the
451     *    command stream or shader will issue Cache Flush operation or
452     *    Cache_Flush sampler message to ensure that the L1 cache remains
453     *    coherent with system memory.
454     *
455     *    [...]
456     *
457     *    Whenever the value of the Dynamic_State_Base_Addr,
458     *    Surface_State_Base_Addr are altered, the L1 state cache must be
459     *    invalidated to ensure the new surface or sampler state is fetched
460     *    from system memory.
461     *
462     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
463     * which, according to the PIPE_CONTROL instruction documentation in the
464     * Broadwell PRM:
465     *
466     *    Setting this bit is independent of any other bit in this packet.
467     *    This bit controls the invalidation of the L1 and L2 state caches
468     *    at the top of the pipe i.e. at the parsing time.
469     *
470     * Unfortunately, experimentation seems to indicate that state cache
471     * invalidation through a PIPE_CONTROL does nothing whatsoever in
472     * regards to surface state and binding tables.  Instead, it seems that
473     * invalidating the texture cache is what is actually needed.
474     *
475     * XXX:  As far as we have been able to determine through
476     * experimentation, flushing the texture cache appears to be
477     * sufficient.  The theory here is that all of the sampling/rendering
478     * units cache the binding table in the texture cache.  However, we have
479     * yet to be able to actually confirm this.
480     *
481     * Wa_16013000631:
482     *
483     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
484     *   or program pipe control with Instruction cache invalidate post
485     *   STATE_BASE_ADDRESS command"
486     */
487    iris_emit_end_of_pipe_sync(batch,
488                               "change STATE_BASE_ADDRESS (invalidates)",
489                               PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
490                               PIPE_CONTROL_CONST_CACHE_INVALIDATE |
491                               PIPE_CONTROL_STATE_CACHE_INVALIDATE |
492                               (intel_needs_workaround(devinfo, 16013000631) ?
493                                PIPE_CONTROL_INSTRUCTION_INVALIDATE : 0));
494 }
495 
496 static void
497 iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,
498                          uint32_t src)
499 {
500    struct mi_builder b;
501    mi_builder_init(&b, batch->screen->devinfo, batch);
502    mi_store(&b, mi_reg32(dst), mi_reg32(src));
503 }
504 
505 static void
506 iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,
507                          uint32_t src)
508 {
509    struct mi_builder b;
510    mi_builder_init(&b, batch->screen->devinfo, batch);
511    mi_store(&b, mi_reg64(dst), mi_reg64(src));
512 }
513 
514 static void
515 iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,
516                          uint32_t val)
517 {
518    struct mi_builder b;
519    mi_builder_init(&b, batch->screen->devinfo, batch);
520    mi_store(&b, mi_reg32(reg), mi_imm(val));
521 }
522 
523 static void
524 iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
525                          uint64_t val)
526 {
527    struct mi_builder b;
528    mi_builder_init(&b, batch->screen->devinfo, batch);
529    mi_store(&b, mi_reg64(reg), mi_imm(val));
530 }
531 
532 /**
533  * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
534  */
535 static void
536 iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
537                          struct iris_bo *bo, uint32_t offset)
538 {
539    iris_batch_sync_region_start(batch);
540    struct mi_builder b;
541    mi_builder_init(&b, batch->screen->devinfo, batch);
542    struct mi_value src = mi_mem32(ro_bo(bo, offset));
543    mi_store(&b, mi_reg32(reg), src);
544    iris_batch_sync_region_end(batch);
545 }
546 
547 /**
548  * Load a 64-bit value from a buffer into an MMIO register via
549  * two MI_LOAD_REGISTER_MEM commands.
550  */
551 static void
552 iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
553                          struct iris_bo *bo, uint32_t offset)
554 {
555    iris_batch_sync_region_start(batch);
556    struct mi_builder b;
557    mi_builder_init(&b, batch->screen->devinfo, batch);
558    struct mi_value src = mi_mem64(ro_bo(bo, offset));
559    mi_store(&b, mi_reg64(reg), src);
560    iris_batch_sync_region_end(batch);
561 }
562 
563 static void
564 iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
565                           struct iris_bo *bo, uint32_t offset,
566                           bool predicated)
567 {
568    iris_batch_sync_region_start(batch);
569    struct mi_builder b;
570    mi_builder_init(&b, batch->screen->devinfo, batch);
571    struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
572    struct mi_value src = mi_reg32(reg);
573    if (predicated)
574       mi_store_if(&b, dst, src);
575    else
576       mi_store(&b, dst, src);
577    iris_batch_sync_region_end(batch);
578 }
579 
580 static void
581 iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
582                           struct iris_bo *bo, uint32_t offset,
583                           bool predicated)
584 {
585    iris_batch_sync_region_start(batch);
586    struct mi_builder b;
587    mi_builder_init(&b, batch->screen->devinfo, batch);
588    struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
589    struct mi_value src = mi_reg64(reg);
590    if (predicated)
591       mi_store_if(&b, dst, src);
592    else
593       mi_store(&b, dst, src);
594    iris_batch_sync_region_end(batch);
595 }
596 
597 static void
598 iris_store_data_imm32(struct iris_batch *batch,
599                       struct iris_bo *bo, uint32_t offset,
600                       uint32_t imm)
601 {
602    iris_batch_sync_region_start(batch);
603    struct mi_builder b;
604    mi_builder_init(&b, batch->screen->devinfo, batch);
605    struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
606    struct mi_value src = mi_imm(imm);
607    mi_store(&b, dst, src);
608    iris_batch_sync_region_end(batch);
609 }
610 
611 static void
612 iris_store_data_imm64(struct iris_batch *batch,
613                       struct iris_bo *bo, uint32_t offset,
614                       uint64_t imm)
615 {
616    iris_batch_sync_region_start(batch);
617    struct mi_builder b;
618    mi_builder_init(&b, batch->screen->devinfo, batch);
619    struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
620    struct mi_value src = mi_imm(imm);
621    mi_store(&b, dst, src);
622    iris_batch_sync_region_end(batch);
623 }
624 
625 static void
626 iris_copy_mem_mem(struct iris_batch *batch,
627                   struct iris_bo *dst_bo, uint32_t dst_offset,
628                   struct iris_bo *src_bo, uint32_t src_offset,
629                   unsigned bytes)
630 {
631    /* MI_COPY_MEM_MEM operates on DWords. */
632    assert(bytes % 4 == 0);
633    assert(dst_offset % 4 == 0);
634    assert(src_offset % 4 == 0);
635    iris_batch_sync_region_start(batch);
636 
637    for (unsigned i = 0; i < bytes; i += 4) {
638       iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
639          cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
640                                              IRIS_DOMAIN_OTHER_WRITE);
641          cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
642       }
643    }
644 
645    iris_batch_sync_region_end(batch);
646 }
647 
648 static void
649 iris_rewrite_compute_walker_pc(struct iris_batch *batch,
650                                uint32_t *walker,
651                                struct iris_bo *bo,
652                                uint32_t offset)
653 {
654 #if GFX_VERx10 >= 125
655    struct iris_screen *screen = batch->screen;
656    struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
657 
658    uint32_t dwords[GENX(COMPUTE_WALKER_length)];
659 
660    _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
661       cw.PostSync.Operation          = WriteTimestamp;
662       cw.PostSync.DestinationAddress = addr;
663       cw.PostSync.MOCS               = iris_mocs(NULL, &screen->isl_dev, 0);
664    }
665 
666    for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
667       walker[i] |= dwords[i];
668 #else
669    unreachable("Unsupported");
670 #endif
671 }
672 
673 static void
674 emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
675 {
676    /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
677 #if GFX_VER < 20
678 
679 #if GFX_VER >= 8 && GFX_VER < 10
680    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
681     *
682     *   Software must clear the COLOR_CALC_STATE Valid field in
683     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
684     *   with Pipeline Select set to GPGPU.
685     *
686     * The internal hardware docs recommend the same workaround for Gfx9
687     * hardware too.
688     */
689    if (pipeline == GPGPU)
690       iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
691 #endif
692 
693 #if GFX_VER >= 12
694    /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
695     *
696     *   "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
697     *   are flushed through a stalling PIPE_CONTROL command prior to
698     *   programming of PIPELINE_SELECT command transitioning Pipeline Select
699     *   from 3D to GPGPU/Media.
700     *   Software must ensure HDC Pipeline flush and Generic Media State Clear
701     *   is issued through a stalling PIPE_CONTROL command prior to programming
702     *   of PIPELINE_SELECT command transitioning Pipeline Select from
703     *   GPGPU/Media to 3D."
704     *
705     * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
706     * because PIPE was not in MEDIA mode?!
707     */
708    enum pipe_control_flags flags = PIPE_CONTROL_CS_STALL |
709                                    PIPE_CONTROL_FLUSH_HDC;
710 
711    if (pipeline == GPGPU && batch->name == IRIS_BATCH_RENDER) {
712       flags |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
713                PIPE_CONTROL_DEPTH_CACHE_FLUSH;
714    } else {
715       flags |= PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH;
716    }
717    /* Wa_16013063087 -  State Cache Invalidate must be issued prior to
718     * PIPELINE_SELECT when switching from 3D to Compute.
719     *
720     * SW must do this by programming of PIPECONTROL with “CS Stall” followed
721     * by a PIPECONTROL with State Cache Invalidate bit set.
722     */
723    if (pipeline == GPGPU &&
724        intel_needs_workaround(batch->screen->devinfo, 16013063087))
725       flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
726 
727    iris_emit_pipe_control_flush(batch, "PIPELINE_SELECT flush", flags);
728 #else
729    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
730     * PIPELINE_SELECT [DevBWR+]":
731     *
732     *    "Project: DEVSNB+
733     *
734     *     Software must ensure all the write caches are flushed through a
735     *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
736     *     command to invalidate read only caches prior to programming
737     *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
738     */
739     iris_emit_pipe_control_flush(batch,
740                                  "workaround: PIPELINE_SELECT flushes (1/2)",
741                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
742                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
743                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
744                                  PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
745                                  PIPE_CONTROL_CS_STALL);
746 
747     iris_emit_pipe_control_flush(batch,
748                                  "workaround: PIPELINE_SELECT flushes (2/2)",
749                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
750                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
751                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
752                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
753 #endif
754 
755    iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
756 #if GFX_VER >= 9
757       sel.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
758 #if GFX_VER == 12
759       sel.MediaSamplerDOPClockGateEnable = true;
760 #endif /* if GFX_VER == 12 */
761 #endif /* if GFX_VER >= 9 */
762       sel.PipelineSelection = pipeline;
763    }
764 #endif /* if GFX_VER < 20 */
765 }
766 
767 UNUSED static void
768 init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
769 {
770 #if GFX_VER == 9
771    /* Project: DevGLK
772     *
773     *    "This chicken bit works around a hardware issue with barrier
774     *     logic encountered when switching between GPGPU and 3D pipelines.
775     *     To workaround the issue, this mode bit should be set after a
776     *     pipeline is selected."
777     */
778    iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
779       reg.GLKBarrierMode = value;
780       reg.GLKBarrierModeMask = 1;
781    }
782 #endif
783 }
784 
785 static void
786 init_state_base_address(struct iris_batch *batch)
787 {
788    struct isl_device *isl_dev = &batch->screen->isl_dev;
789    uint32_t mocs = isl_mocs(isl_dev, 0, false);
790    flush_before_state_base_change(batch);
791 
792    /* We program most base addresses once at context initialization time.
793     * Each base address points at a 4GB memory zone, and never needs to
794     * change.  See iris_bufmgr.h for a description of the memory zones.
795     *
796     * The one exception is Surface State Base Address, which needs to be
797     * updated occasionally.  See iris_binder.c for the details there.
798     */
799    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
800       sba.GeneralStateMOCS            = mocs;
801       sba.StatelessDataPortAccessMOCS = mocs;
802       sba.DynamicStateMOCS            = mocs;
803       sba.IndirectObjectMOCS          = mocs;
804       sba.InstructionMOCS             = mocs;
805       sba.SurfaceStateMOCS            = mocs;
806 #if GFX_VER >= 9
807       sba.BindlessSurfaceStateMOCS    = mocs;
808 #endif
809 
810       sba.GeneralStateBaseAddressModifyEnable   = true;
811       sba.DynamicStateBaseAddressModifyEnable   = true;
812       sba.IndirectObjectBaseAddressModifyEnable = true;
813       sba.InstructionBaseAddressModifyEnable    = true;
814       sba.GeneralStateBufferSizeModifyEnable    = true;
815       sba.DynamicStateBufferSizeModifyEnable    = true;
816       sba.SurfaceStateBaseAddressModifyEnable   = true;
817 #if GFX_VER >= 11
818       sba.BindlessSamplerStateMOCS    = mocs;
819 #endif
820       sba.IndirectObjectBufferSizeModifyEnable  = true;
821       sba.InstructionBuffersizeModifyEnable     = true;
822 
823       sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
824       sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
825       sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDER_START);
826 
827       sba.GeneralStateBufferSize   = 0xfffff;
828       sba.IndirectObjectBufferSize = 0xfffff;
829       sba.InstructionBufferSize    = 0xfffff;
830       sba.DynamicStateBufferSize   = 0xfffff;
831 #if GFX_VERx10 >= 125
832       sba.L1CacheControl = L1CC_WB;
833 #endif
834    }
835 
836    flush_after_state_base_change(batch);
837 }
838 
839 static void
840 iris_emit_l3_config(struct iris_batch *batch,
841                     const struct intel_l3_config *cfg)
842 {
843 #if GFX_VER < 20
844    assert(cfg || GFX_VER >= 12);
845 
846 #if GFX_VER >= 12
847 #define L3_ALLOCATION_REG GENX(L3ALLOC)
848 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
849 #else
850 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
851 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
852 #endif
853 
854    iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
855 #if GFX_VER < 11
856       reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
857 #endif
858 #if GFX_VER == 11
859       /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
860        * in L3CNTLREG register. The default setting of the bit is not the
861        * desirable behavior.
862        */
863       reg.ErrorDetectionBehaviorControl = true;
864       reg.UseFullWays = true;
865 #endif
866       if (GFX_VER < 12 || (cfg && cfg->n[INTEL_L3P_ALL] <= 126)) {
867          reg.URBAllocation = cfg->n[INTEL_L3P_URB];
868          reg.ROAllocation = cfg->n[INTEL_L3P_RO];
869          reg.DCAllocation = cfg->n[INTEL_L3P_DC];
870          reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
871       } else {
872          assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
873                           cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
874                           cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
875                           cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
876 #if GFX_VER >= 12
877          reg.L3FullWayAllocationEnable = true;
878 #endif
879       }
880    }
881 #endif /* GFX_VER < 20 */
882 }
883 
884 void
885 genX(emit_urb_config)(struct iris_batch *batch,
886                       bool has_tess_eval,
887                       bool has_geometry)
888 {
889    struct iris_screen *screen = batch->screen;
890    struct iris_context *ice = batch->ice;
891 
892    intel_get_urb_config(screen->devinfo,
893                         screen->l3_config_3d,
894                         has_tess_eval,
895                         has_geometry,
896                         &ice->shaders.urb.cfg,
897                         &ice->state.urb_deref_block_size,
898                         &ice->shaders.urb.constrained);
899 
900    genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
901 
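   /* 3DSTATE_URB_VS/HS/DS/GS share a layout and use consecutive
    * 3DCommandSubOpcodes, so we pack the VS variant and bump the sub-opcode
    * by the stage index to address each stage's packet.
    */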
902    for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
903       iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
904          urb._3DCommandSubOpcode += i;
905          urb.VSURBStartingAddress     = ice->shaders.urb.cfg.start[i];
906          urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
907          urb.VSNumberofURBEntries     = ice->shaders.urb.cfg.entries[i];
908       }
909    }
910 }
911 
912 #if GFX_VER == 9
913 static void
914 iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
915 {
916    /* A fixed function pipe flush is required before modifying this field */
917    iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"
918                                             : "disable preemption",
919                               PIPE_CONTROL_RENDER_TARGET_FLUSH);
920 
921    /* enable object level preemption */
922    iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
923       reg.ReplayMode = enable;
924       reg.ReplayModeMask = true;
925    }
926 }
927 #endif
928 
929 static void
930 upload_pixel_hashing_tables(struct iris_batch *batch)
931 {
932    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
933    UNUSED struct iris_context *ice = batch->ice;
934    assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
935 
936 #if GFX_VER == 11
937    /* Gfx11 hardware has two pixel pipes at most. */
938    for (unsigned i = 2; i < ARRAY_SIZE(devinfo->ppipe_subslices); i++)
939       assert(devinfo->ppipe_subslices[i] == 0);
940 
941    if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
942       return;
943 
944    unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
945    uint32_t hash_address;
946    struct pipe_resource *tmp = NULL;
947    uint32_t *map =
948       stream_state(batch, ice->state.dynamic_uploader, &tmp,
949                    size, 64, &hash_address);
950    pipe_resource_reference(&tmp, NULL);
951 
952    const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
953    struct GENX(SLICE_HASH_TABLE) table;
954    intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
955 
956    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
957 
958    iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
959       ptr.SliceHashStatePointerValid = true;
960       ptr.SliceHashTableStatePointer = hash_address;
961    }
962 
963    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
964       mode.SliceHashingTableEnable = true;
965    }
966 
967 #elif GFX_VERx10 == 120
968    /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
969     * present with n active dual subslices.
970     */
971    unsigned ppipes_of[3] = {};
972 
973    for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
974       for (unsigned p = 0; p < 3; p++)
975          ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
976    }
977 
978    /* Gfx12 has three pixel pipes. */
979    for (unsigned p = 3; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
980       assert(devinfo->ppipe_subslices[p] == 0);
981 
982    if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
983       /* All three pixel pipes have the maximum number of active dual
984        * subslices, or there is only one active pixel pipe: Nothing to do.
985        */
986       return;
987    }
988 
989    iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
990       p.SliceHashControl[0] = TABLE_0;
991 
992       if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
993          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
994       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
995          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
996 
997       if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
998          intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
999       else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1000          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
1001       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1002          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
1003       else
1004          unreachable("Illegal fusing.");
1005    }
1006 
1007    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1008       p.SubsliceHashingTableEnable = true;
1009       p.SubsliceHashingTableEnableMask = true;
1010    }
1011 
1012 #elif GFX_VERx10 == 125
1013    struct pipe_screen *pscreen = &batch->screen->base;
1014    const unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
1015    const struct pipe_resource tmpl = {
1016      .target = PIPE_BUFFER,
1017      .format = PIPE_FORMAT_R8_UNORM,
1018      .bind = PIPE_BIND_CUSTOM,
1019      .usage = PIPE_USAGE_IMMUTABLE,
1020      .flags = IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE,
1021      .width0 = size,
1022      .height0 = 1,
1023      .depth0 = 1,
1024      .array_size = 1
1025    };
1026 
1027    pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
1028    ice->state.pixel_hashing_tables = pscreen->resource_create(pscreen, &tmpl);
1029 
1030    struct iris_resource *res = (struct iris_resource *)ice->state.pixel_hashing_tables;
1031    struct pipe_transfer *transfer = NULL;
1032    uint32_t *map = pipe_buffer_map_range(&ice->ctx, ice->state.pixel_hashing_tables,
1033                                          0, size, PIPE_MAP_WRITE,
1034                                          &transfer);
1035 
1036    /* Calculate the set of present pixel pipes, and another set of
1037     * present pixel pipes with 2 dual subslices enabled; the latter
1038     * will appear on the hashing table with twice the frequency of
1039     * pixel pipes with a single dual subslice present.
1040     */
1041    uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
1042    for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
1043       if (devinfo->ppipe_subslices[p])
1044          ppipe_mask1 |= (1u << p);
1045       if (devinfo->ppipe_subslices[p] > 1)
1046          ppipe_mask2 |= (1u << p);
1047    }
1048    assert(ppipe_mask1);
1049 
1050    struct GENX(SLICE_HASH_TABLE) table;
1051 
1052    /* Note that the hardware expects an array with 7 tables; each
1053     * table is intended to specify the pixel pipe hashing behavior for
1054     * every possible slice count between 2 and 8.  However, that doesn't
1055     * actually work, among other reasons due to hardware bugs that
1056     * will cause the GPU to erroneously access the table at the wrong
1057     * index in some cases, so in practice all 7 tables need to be
1058     * initialized to the same value.
1059     */
1060    for (unsigned i = 0; i < 7; i++)
1061       intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
1062                                           table.Entry[i][0]);
1063 
1064    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
1065 
1066    pipe_buffer_unmap(&ice->ctx, transfer);
1067 
1068    iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_NONE);
1069    iris_record_state_size(batch->state_sizes, res->bo->address + res->offset, size);
1070 
1071    iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
1072       ptr.SliceHashStatePointerValid = true;
1073       ptr.SliceHashTableStatePointer = iris_bo_offset_from_base_address(res->bo) +
1074                                        res->offset;
1075    }
1076 
1077    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
1078       mode.SliceHashingTableEnable = true;
1079       mode.SliceHashingTableEnableMask = true;
1080       mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
1081                                     hashing32x32 : NormalMode);
1082       mode.CrossSliceHashingModeMask = -1;
1083    }
1084 #endif
1085 }
1086 
1087 static void
1088 iris_alloc_push_constants(struct iris_batch *batch)
1089 {
1090    const struct intel_device_info *devinfo = batch->screen->devinfo;
1091 
1092    /* For now, we set a static partitioning of the push constant area,
1093     * assuming that all stages could be in use.
1094     *
1095     * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1096     *       see if that improves performance by offering more space to
1097     *       the VS/FS when those aren't in use.  Also, try dynamically
1098     *       enabling/disabling it like i965 does.  This would be more
1099     *       stalls and may not actually help; we don't know yet.
1100     */
1101 
1102    /* Divide as equally as possible with any remainder given to FRAGMENT. */
1103    const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
1104    const unsigned stage_size = push_constant_kb / 5;
1105    const unsigned frag_size = push_constant_kb - 4 * stage_size;
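   /* For example, with a 32 KB push constant area this gives VS/HS/DS/GS a
    * 6 KB section each and the remaining 32 - 4 * 6 = 8 KB to the FS.
    */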
1106 
1107    for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1108       iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1109          alloc._3DCommandSubOpcode = 18 + i;
1110          alloc.ConstantBufferOffset = stage_size * i;
1111          alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
1112       }
1113    }
1114 
1115 #if GFX_VERx10 == 125
1116    /* DG2: Wa_22011440098
1117     * MTL: Wa_18022330953
1118     *
1119     * In 3D mode, after programming push constant alloc command immediately
1120     * program push constant command (ZERO length) without any commit between
1121     * them.
1122     */
1123    iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
1124       /* Update empty push constants for all stages (bitmask = 11111b) */
1125       c.ShaderUpdateEnable = 0x1f;
1126       c.MOCS = iris_mocs(NULL, &batch->screen->isl_dev, 0);
1127    }
1128 #endif
1129 }
1130 
1131 #if GFX_VER >= 12
1132 static void
1133 init_aux_map_state(struct iris_batch *batch);
1134 #endif
1135 
1136 /* This updates a register. Caller should stall the pipeline as needed. */
1137 static void
1138 iris_disable_rhwo_optimization(struct iris_batch *batch, bool disable)
1139 {
1140    assert(batch->screen->devinfo->verx10 == 120);
1141 #if GFX_VERx10 == 120
1142    iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1143       c1.RCCRHWOOptimizationDisable = disable;
1144       c1.RCCRHWOOptimizationDisableMask = true;
1145    };
1146 #endif
1147 }
1148 
1149 /**
1150  * Upload initial GPU state for any kind of context.
1151  *
1152  * These need to happen for both render and compute.
1153  */
1154 static void
1155 iris_init_common_context(struct iris_batch *batch)
1156 {
1157 #if GFX_VER == 11
1158    iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
1159       reg.HeaderlessMessageforPreemptableContexts = 1;
1160       reg.HeaderlessMessageforPreemptableContextsMask = 1;
1161    }
1162 
1163    /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
1164    iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
1165       reg.EnabledTexelOffsetPrecisionFix = 1;
1166       reg.EnabledTexelOffsetPrecisionFixMask = 1;
1167    }
1168 #endif
1169 
1170    /* Select 256B-aligned binding table mode on Icelake through Tigerlake,
1171     * which gives us larger binding table pointers, at the cost of higher
1172     * alignment requirements (bits 18:8 are valid instead of 15:5).  When
1173     * using this mode, we have to shift binding table pointers by 3 bits,
1174     * as they're still stored in the same bit-location in the field.
1175     */
1176 #if GFX_VER >= 11 && GFX_VERx10 < 125
1177    iris_emit_reg(batch, GENX(GT_MODE), reg) {
1178       reg.BindingTableAlignment = BTP_18_8;
1179       reg.BindingTableAlignmentMask = true;
1180    }
1181 #endif
1182 
1183 #if GFX_VERx10 == 125
1184    /* Even though L3 partial write merging is supposed to be enabled
1185     * by default on Gfx12.5 according to the hardware spec, i915
1186     * appears to accidentally clear the enables during context
1187     * initialization, so make sure to enable them here since partial
1188     * write merging has a large impact on rendering performance.
1189     */
1190    iris_emit_reg(batch, GENX(L3SQCREG5), reg) {
1191       reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
1192       reg.CompressiblePartialWriteMergeEnable = true;
1193       reg.CoherentPartialWriteMergeEnable = true;
1194       reg.CrossTilePartialWriteMergeEnable = true;
1195    }
1196 #endif
1197 }
1198 
1199 static void
1200 toggle_protected(struct iris_batch *batch)
1201 {
1202    struct iris_context *ice;
1203 
1204    if (batch->name == IRIS_BATCH_RENDER)
1205       ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_RENDER]);
1206    else if (batch->name == IRIS_BATCH_COMPUTE)
1207       ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_COMPUTE]);
1208    else
1209       unreachable("unhandled batch");
1210 
1211    if (!ice->protected)
1212       return;
1213 
1214 #if GFX_VER >= 12
1215    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1216       pc.CommandStreamerStallEnable = true;
1217       pc.RenderTargetCacheFlushEnable = true;
1218       pc.ProtectedMemoryDisable = true;
1219    }
1220    iris_emit_cmd(batch, GENX(MI_SET_APPID), appid) {
1221       /* Default value for single session. */
1222       appid.ProtectedMemoryApplicationID = 0xf;
1223       appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
1224    }
1225    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1226       pc.CommandStreamerStallEnable = true;
1227       pc.RenderTargetCacheFlushEnable = true;
1228       pc.ProtectedMemoryEnable = true;
1229    }
1230 #else
1231    unreachable("Not supported");
1232 #endif
1233 }
1234 
1235 #if GFX_VER >= 20
1236 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
1237 #else
1238 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
1239 #endif
1240 
1241 /**
1242  * Upload the initial GPU state for a render context.
1243  *
1244  * This sets some invariant state that needs to be programmed a particular
1245  * way, but we never actually change.
1246  */
1247 static void
1248 iris_init_render_context(struct iris_batch *batch)
1249 {
1250    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1251 
1252    iris_batch_sync_region_start(batch);
1253 
1254    emit_pipeline_select(batch, _3D);
1255 
1256    toggle_protected(batch);
1257 
1258    iris_emit_l3_config(batch, batch->screen->l3_config_3d);
1259 
1260    init_state_base_address(batch);
1261 
1262    iris_init_common_context(batch);
1263 
1264 #if GFX_VER >= 9
1265    iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1266       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1267       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1268    }
1269 #else
1270    iris_emit_reg(batch, GENX(INSTPM), reg) {
1271       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1272       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1273    }
1274 #endif
1275 
1276 #if GFX_VER == 9
1277    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1278       reg.FloatBlendOptimizationEnable = true;
1279       reg.FloatBlendOptimizationEnableMask = true;
1280       reg.MSCRAWHazardAvoidanceBit = true;
1281       reg.MSCRAWHazardAvoidanceBitMask = true;
1282       reg.PartialResolveDisableInVC = true;
1283       reg.PartialResolveDisableInVCMask = true;
1284    }
1285 
1286    if (devinfo->platform == INTEL_PLATFORM_GLK)
1287       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1288 #endif
1289 
1290 #if GFX_VER == 11
1291    iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1292       reg.L3DataPartialWriteMergingEnable = true;
1293       reg.ColorZPartialWriteMergingEnable = true;
1294       reg.URBPartialWriteMergingEnable = true;
1295       reg.TCDisable = true;
1296    }
1297 
1298    /* The hardware specification recommends disabling repacking for
1299     * compatibility with the decompression mechanism in the display controller.
1300     */
1301    if (devinfo->disable_ccs_repack) {
1302       iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1303          reg.DisableRepackingforCompression = true;
1304          reg.DisableRepackingforCompressionMask = true;
1305       }
1306    }
1307 #endif
1308 
1309 #if GFX_VER == 12
1310    iris_emit_reg(batch, GENX(FF_MODE2), reg) {
1311       /* On Alchemist, the FF_MODE2 docs for the GS timer say:
1312        *
1313        *    "The timer value must be set to 224."
1314        *
1315        * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
1316        * and that this is necessary to avoid hanging the HS/DS units.  It
1317        * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
1318        *
1319        * The HS timer docs also have the same quote for Alchemist.  I am
1320        * unaware of a reason it needs to be set to 224 on Tigerlake, but
1321        * we do so for consistency if nothing else.
1322        *
1323        * For the TDS timer value, the docs say:
1324        *
1325        *    "For best performance, a value of 4 should be programmed."
1326        *
1327        * i915 also sets it this way on Tigerlake due to workarounds.
1328        *
1329        * The default VS timer appears to be 0, so we leave it at that.
1330        */
1331       reg.GSTimerValue  = 224;
1332       reg.HSTimerValue  = 224;
1333       reg.TDSTimerValue = 4;
1334       reg.VSTimerValue  = 0;
1335    }
1336 #endif
1337 
1338 #if INTEL_NEEDS_WA_1508744258
1339    /* The suggested workaround is:
1340     *
1341     *    Disable RHWO by setting 0x7010[14] by default except during resolve
1342     *    pass.
1343     *
1344     * We implement global disabling of the optimization here and we toggle it
1345     * in iris_resolve_color.
1346     *
1347     * iris_init_compute_context is unmodified because we don't expect to
1348     * access the RCC in the compute context. iris_mcs_partial_resolve is
1349     * unmodified because that pass doesn't use a HW bit to perform the
1350     * resolve (related HSDs specifically call out the RenderTargetResolveType
1351     * field in the 3DSTATE_PS instruction).
1352     */
1353    iris_disable_rhwo_optimization(batch, true);
1354 #endif
1355 
1356 #if GFX_VERx10 == 120
1357    /* Wa_1806527549 says to disable the following HiZ optimization when the
1358     * depth buffer is D16_UNORM. We've found the WA to help with more depth
1359     * buffer configurations, however, so we always disable it just to be safe.
1360     */
1361    iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
1362       reg.HZDepthTestLEGEOptimizationDisable = true;
1363       reg.HZDepthTestLEGEOptimizationDisableMask = true;
1364    }
1365 #endif
1366 
1367 #if GFX_VERx10 == 125
1368    iris_emit_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
1369       reg.TBIMRBatchSizeOverride = true;
1370       reg.TBIMROpenBatchEnable = true;
1371       reg.TBIMRFastClip = true;
1372       reg.TBIMRBatchSizeOverrideMask = true;
1373       reg.TBIMROpenBatchEnableMask = true;
1374       reg.TBIMRFastClipMask = true;
1375    };
1376 #endif
1377 
1378 #if GFX_VER >= 20
1379    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1380       p.DX10OGLBorderModeforYCRCB = true;
1381       p.DX10OGLBorderModeforYCRCBMask = true;
1382    }
1383 #endif
1384 
1385    upload_pixel_hashing_tables(batch);
1386 
1387    /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1388     * changing it dynamically.  We set it to the maximum size here, and
1389     * instead include the render target dimensions in the viewport, so
1390     * viewport extents clipping takes care of pruning stray geometry.
1391     */
1392    iris_emit_cmd(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
1393       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1394       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1395    }
1396 
1397    /* Set the initial MSAA sample positions. */
1398    iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1399       INTEL_SAMPLE_POS_1X(pat._1xSample);
1400       INTEL_SAMPLE_POS_2X(pat._2xSample);
1401       INTEL_SAMPLE_POS_4X(pat._4xSample);
1402       INTEL_SAMPLE_POS_8X(pat._8xSample);
1403 #if GFX_VER >= 9
1404       INTEL_SAMPLE_POS_16X(pat._16xSample);
1405 #endif
1406    }
1407 
1408    /* Use the legacy AA line coverage computation. */
1409    iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1410 
1411    /* Disable chromakeying (it's for media) */
1412    iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1413 
1414    /* We want regular rendering, not special HiZ operations. */
1415    iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1416 
1417    /* No polygon stippling offsets are necessary. */
1418    /* TODO: may need to set an offset for origin-UL framebuffers */
1419    iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1420 
1421 #if GFX_VERx10 >= 125
1422    iris_emit_cmd(batch, GENX(3DSTATE_MESH_CONTROL), foo);
1423    iris_emit_cmd(batch, GENX(3DSTATE_TASK_CONTROL), foo);
1424 #endif
1425 
1426 #if INTEL_NEEDS_WA_14019857787
1427    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1428       p.EnableOOOreadsinRCPB = true;
1429       p.EnableOOOreadsinRCPBMask = true;
1430    }
1431 #endif
1432 
1433    iris_alloc_push_constants(batch);
1434 
1435 #if GFX_VER >= 12
1436    init_aux_map_state(batch);
1437 #endif
1438 
1439    iris_batch_sync_region_end(batch);
1440 }
1441 
1442 static void
1443 iris_init_compute_context(struct iris_batch *batch)
1444 {
1445    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1446 
1447    iris_batch_sync_region_start(batch);
1448 
1449    /* Wa_1607854226:
1450     *
1451     *  Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1452     */
1453 #if GFX_VERx10 == 120
1454    emit_pipeline_select(batch, _3D);
1455 #else
1456    emit_pipeline_select(batch, GPGPU);
1457 #endif
1458 
1459    toggle_protected(batch);
1460 
1461    iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1462 
1463    init_state_base_address(batch);
1464 
1465    iris_init_common_context(batch);
1466 
1467 #if GFX_VERx10 == 120
1468    emit_pipeline_select(batch, GPGPU);
1469 #endif
1470 
1471 #if GFX_VER == 9
1472    if (devinfo->platform == INTEL_PLATFORM_GLK)
1473       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1474 #endif
1475 
1476 #if GFX_VER >= 12
1477    init_aux_map_state(batch);
1478 #endif
1479 
1480 #if GFX_VERx10 >= 125
1481    iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
1482       cfe.MaximumNumberofThreads =
1483          devinfo->max_cs_threads * devinfo->subslice_total;
1484    }
1485 #endif
1486 
1487    iris_batch_sync_region_end(batch);
1488 }
1489 
1490 static void
1491 iris_init_copy_context(struct iris_batch *batch)
1492 {
1493    iris_batch_sync_region_start(batch);
1494 
1495 #if GFX_VER >= 12
1496    init_aux_map_state(batch);
1497 #endif
1498 
1499    iris_batch_sync_region_end(batch);
1500 }
1501 
1502 struct iris_vertex_buffer_state {
1503    /** The VERTEX_BUFFER_STATE hardware structure. */
1504    uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1505 
1506    /** The resource to source vertex data from. */
1507    struct pipe_resource *resource;
1508 
1509    int offset;
1510 };
1511 
1512 struct iris_depth_buffer_state {
1513    /* Depth/HiZ/Stencil related hardware packets. */
1514 #if GFX_VER < 20
1515    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1516                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1517                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
1518                     GENX(3DSTATE_CLEAR_PARAMS_length)];
1519 #else
1520    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1521                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1522                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
1523 #endif
1524 };
1525 
1526 #if INTEL_NEEDS_WA_1808121037
1527 enum iris_depth_reg_mode {
1528    IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
1529    IRIS_DEPTH_REG_MODE_D16_1X_MSAA,
1530    IRIS_DEPTH_REG_MODE_UNKNOWN,
1531 };
1532 #endif
1533 
1534 /**
1535  * Generation-specific context state (ice->state.genx->...).
1536  *
1537  * Most state can go in iris_context directly, but these encode hardware
1538  * packets which vary by generation.
1539  */
1540 struct iris_genx_state {
1541    struct iris_vertex_buffer_state vertex_buffers[33];
1542    uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1543 
1544    struct iris_depth_buffer_state depth_buffer;
1545 
1546    uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1547 
1548 #if GFX_VER == 8
1549    bool pma_fix_enabled;
1550 #endif
1551 
1552    /* Is object level preemption enabled? */
1553    bool object_preemption;
1554 
1555 #if INTEL_NEEDS_WA_1808121037
1556    enum iris_depth_reg_mode depth_reg_mode;
1557 #endif
1558 
1559    struct {
1560 #if GFX_VER == 8
1561       struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1562 #endif
1563    } shaders[MESA_SHADER_STAGES];
1564 };
1565 
1566 /**
1567  * The pipe->set_blend_color() driver hook.
1568  *
1569  * This corresponds to our COLOR_CALC_STATE.
1570  */
1571 static void
1572 iris_set_blend_color(struct pipe_context *ctx,
1573                      const struct pipe_blend_color *state)
1574 {
1575    struct iris_context *ice = (struct iris_context *) ctx;
1576 
1577    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1578    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1579    ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1580 }
1581 
1582 /**
1583  * Gallium CSO for blend state (see pipe_blend_state).
1584  */
1585 struct iris_blend_state {
1586    /** Partial 3DSTATE_PS_BLEND */
1587    uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1588 
1589    /** Partial BLEND_STATE */
1590    uint32_t blend_state[GENX(BLEND_STATE_length) +
1591                         IRIS_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1592 
1593    bool alpha_to_coverage; /* for shader key */
1594 
1595    /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1596    uint8_t blend_enables;
1597 
1598    /** Bitfield of whether color writes are enabled for RT[i] */
1599    uint8_t color_write_enables;
1600 
1601    /** Does RT[0] use dual color blending? */
1602    bool dual_color_blending;
1603 
1604    int ps_dst_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1605    int ps_dst_alpha_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1606 };
1607 
1608 static enum pipe_blendfactor
1609 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1610 {
1611    if (alpha_to_one) {
1612       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1613          return PIPE_BLENDFACTOR_ONE;
1614 
1615       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1616          return PIPE_BLENDFACTOR_ZERO;
1617    }
1618 
1619    return f;
1620 }
1621 
1622 /**
1623  * The pipe->create_blend_state() driver hook.
1624  *
1625  * Translates a pipe_blend_state into iris_blend_state.
1626  */
1627 static void *
1628 iris_create_blend_state(struct pipe_context *ctx,
1629                         const struct pipe_blend_state *state)
1630 {
1631    struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
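   /* Per-RT BLEND_STATE_ENTRY structures are packed immediately after the
    * BLEND_STATE header inside cso->blend_state; blend_entry walks them.
    */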
1632    uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1633 
1634    cso->blend_enables = 0;
1635    cso->color_write_enables = 0;
1636    STATIC_ASSERT(IRIS_MAX_DRAW_BUFFERS <= 8);
1637 
1638    cso->alpha_to_coverage = state->alpha_to_coverage;
1639 
1640    bool indep_alpha_blend = false;
1641 
1642    for (int i = 0; i < IRIS_MAX_DRAW_BUFFERS; i++) {
1643       const struct pipe_rt_blend_state *rt =
1644          &state->rt[state->independent_blend_enable ? i : 0];
1645 
1646       enum pipe_blendfactor src_rgb =
1647          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1648       enum pipe_blendfactor src_alpha =
1649          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1650       enum pipe_blendfactor dst_rgb =
1651          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1652       enum pipe_blendfactor dst_alpha =
1653          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1654 
1655       /* Stored separately in cso for dynamic emission. */
1656       cso->ps_dst_blend_factor[i] = (int) dst_rgb;
1657       cso->ps_dst_alpha_blend_factor[i] = (int) dst_alpha;
1658 
1659       if (rt->rgb_func != rt->alpha_func ||
1660           src_rgb != src_alpha || dst_rgb != dst_alpha)
1661          indep_alpha_blend = true;
1662 
1663       if (rt->blend_enable)
1664          cso->blend_enables |= 1u << i;
1665 
1666       if (rt->colormask)
1667          cso->color_write_enables |= 1u << i;
1668 
1669       iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1670          be.LogicOpEnable = state->logicop_enable;
1671          be.LogicOpFunction = state->logicop_func;
1672 
1673          be.PreBlendSourceOnlyClampEnable = false;
1674          be.ColorClampRange = COLORCLAMP_RTFORMAT;
1675          be.PreBlendColorClampEnable = true;
1676          be.PostBlendColorClampEnable = true;
1677 
1678          be.ColorBufferBlendEnable = rt->blend_enable;
1679 
1680          be.ColorBlendFunction          = rt->rgb_func;
1681          be.AlphaBlendFunction          = rt->alpha_func;
1682 
1683          /* The casts prevent warnings about implicit enum type conversions. */
1684          be.SourceBlendFactor           = (int) src_rgb;
1685          be.SourceAlphaBlendFactor      = (int) src_alpha;
1686 
1687          be.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
1688          be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1689          be.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
1690          be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1691       }
1692       blend_entry += GENX(BLEND_STATE_ENTRY_length);
1693    }
1694 
1695    iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1696       /* pb.HasWriteableRT is filled in at draw time.
1697        * pb.AlphaTestEnable is filled in at draw time.
1698        *
1699        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1700        * setting it when dual color blending without an appropriate shader.
1701        */
1702 
1703       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1704       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1705 
1706       /* The casts prevent warnings about implicit enum type conversions. */
1707       pb.SourceBlendFactor =
1708          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1709       pb.SourceAlphaBlendFactor =
1710          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1711    }
1712 
1713    iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1714       bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1715       bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1716       bs.AlphaToOneEnable = state->alpha_to_one;
1717       bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage_dither;
1718       bs.ColorDitherEnable = state->dither;
1719       /* bs.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1720    }
1721 
1722    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1723 
1724    return cso;
1725 }
1726 
1727 /**
1728  * The pipe->bind_blend_state() driver hook.
1729  *
1730  * Bind a blending CSO and flag related dirty bits.
1731  */
1732 static void
1733 iris_bind_blend_state(struct pipe_context *ctx, void *state)
1734 {
1735    struct iris_context *ice = (struct iris_context *) ctx;
1736    struct iris_blend_state *cso = state;
1737 
1738    ice->state.cso_blend = cso;
1739 
1740    ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1741    ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1742    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1743 
1744    if (GFX_VER == 8)
1745       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1746 }
1747 
1748 /**
1749  * Return true if the FS writes to any color outputs which are not disabled
1750  * via color masking.
1751  */
1752 static bool
1753 has_writeable_rt(const struct iris_blend_state *cso_blend,
1754                  const struct shader_info *fs_info)
1755 {
1756    if (!fs_info)
1757       return false;
1758 
1759    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1760 
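   /* Writing gl_FragColor (FRAG_RESULT_COLOR) is broadcast to every bound
    * color attachment, so treat all render targets as written.
    */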
1761    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1762       rt_outputs = (1 << IRIS_MAX_DRAW_BUFFERS) - 1;
1763 
1764    return cso_blend->color_write_enables & rt_outputs;
1765 }
1766 
1767 /**
1768  * Gallium CSO for depth, stencil, and alpha testing state.
1769  */
1770 struct iris_depth_stencil_alpha_state {
1771    /** Partial 3DSTATE_WM_DEPTH_STENCIL. */
1772    uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
1773 
1774 #if GFX_VER >= 12
1775    uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
1776 #endif
1777 
1778    /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */
1779    unsigned alpha_enabled:1;
1780    unsigned alpha_func:3;     /**< PIPE_FUNC_x */
1781    float alpha_ref_value;     /**< reference value */
1782 
1783    /** Outbound to resolve and cache set tracking. */
1784    bool depth_writes_enabled;
1785    bool stencil_writes_enabled;
1786 
1787    /** Outbound to Gfx8-9 PMA stall equations */
1788    bool depth_test_enabled;
1789 
1790    /** Tracking state of DS writes for Wa_18019816803. */
1791    bool ds_write_state;
1792 };
1793 
1794 /**
1795  * The pipe->create_depth_stencil_alpha_state() driver hook.
1796  *
1797  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1798  * testing state since we need pieces of it in a variety of places.
1799  */
1800 static void *
1801 iris_create_zsa_state(struct pipe_context *ctx,
1802                       const struct pipe_depth_stencil_alpha_state *state)
1803 {
1804    struct iris_depth_stencil_alpha_state *cso =
1805       malloc(sizeof(struct iris_depth_stencil_alpha_state));
1806 
1807    bool two_sided_stencil = state->stencil[1].enabled;
1808 
1809    bool depth_write_enabled = false;
1810    bool stencil_write_enabled = false;
1811 
1812    /* Depth writes enabled? */
1813    if (state->depth_writemask &&
1814       ((!state->depth_enabled) ||
1815       ((state->depth_func != PIPE_FUNC_NEVER) &&
1816         (state->depth_func != PIPE_FUNC_EQUAL))))
1817       depth_write_enabled = true;
1818 
1819    bool stencil_all_keep =
1820       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1821       state->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1822       state->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1823       (!two_sided_stencil ||
1824        (state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1825         state->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1826         state->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP));
1827 
1828    bool stencil_mask_zero =
1829       state->stencil[0].writemask == 0 ||
1830       (!two_sided_stencil || state->stencil[1].writemask  == 0);
1831 
1832    bool stencil_func_never =
1833       state->stencil[0].func == PIPE_FUNC_NEVER &&
1834       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1835       (!two_sided_stencil ||
1836        (state->stencil[1].func == PIPE_FUNC_NEVER &&
1837         state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP));
1838 
1839    /* Stencil writes enabled? */
1840    if (state->stencil[0].writemask != 0 ||
1841       ((two_sided_stencil && state->stencil[1].writemask != 0) &&
1842        (!stencil_all_keep &&
1843         !stencil_mask_zero &&
1844         !stencil_func_never)))
1845       stencil_write_enabled = true;
1846 
1847    cso->ds_write_state = depth_write_enabled || stencil_write_enabled;
1848 
1849    cso->alpha_enabled = state->alpha_enabled;
1850    cso->alpha_func = state->alpha_func;
1851    cso->alpha_ref_value = state->alpha_ref_value;
1852    cso->depth_writes_enabled = state->depth_writemask;
1853    cso->depth_test_enabled = state->depth_enabled;
1854    cso->stencil_writes_enabled =
1855       state->stencil[0].writemask != 0 ||
1856       (two_sided_stencil && state->stencil[1].writemask != 0);
1857 
1858    /* gallium frontends need to optimize away EQUAL writes for us. */
1859    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1860 
1861    iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1862       wmds.StencilFailOp = state->stencil[0].fail_op;
1863       wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1864       wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1865       wmds.StencilTestFunction =
1866          translate_compare_func(state->stencil[0].func);
1867       wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1868       wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1869       wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1870       wmds.BackfaceStencilTestFunction =
1871          translate_compare_func(state->stencil[1].func);
1872       wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1873       wmds.DoubleSidedStencilEnable = two_sided_stencil;
1874       wmds.StencilTestEnable = state->stencil[0].enabled;
1875       wmds.StencilBufferWriteEnable =
1876          state->stencil[0].writemask != 0 ||
1877          (two_sided_stencil && state->stencil[1].writemask != 0);
1878       wmds.DepthTestEnable = state->depth_enabled;
1879       wmds.DepthBufferWriteEnable = state->depth_writemask;
1880       wmds.StencilTestMask = state->stencil[0].valuemask;
1881       wmds.StencilWriteMask = state->stencil[0].writemask;
1882       wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1883       wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1884       /* wmds.[Backface]StencilReferenceValue are merged later */
1885 #if GFX_VER >= 12
1886       wmds.StencilReferenceValueModifyDisable = true;
1887 #endif
1888    }
1889 
1890 #if GFX_VER >= 12
1891    iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1892       depth_bounds.DepthBoundsTestValueModifyDisable = false;
1893       depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1894       depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1895       depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1896       depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1897    }
1898 #endif
1899 
1900    return cso;
1901 }
1902 
1903 /**
1904  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1905  *
1906  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1907  */
1908 static void
1909 iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1910 {
1911    struct iris_context *ice = (struct iris_context *) ctx;
1912    struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1913    struct iris_depth_stencil_alpha_state *new_cso = state;
1914 
1915    if (new_cso) {
1916       if (cso_changed(alpha_ref_value))
1917          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1918 
1919       if (cso_changed(alpha_enabled))
1920          ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
1921 
1922       if (cso_changed(alpha_func))
1923          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1924 
1925       if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
1926          ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1927 
1928       ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1929       ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1930 
1931       /* State ds_write_enable changed, need to flag dirty DS. */
1932       if (!old_cso || (ice->state.ds_write_state != new_cso->ds_write_state)) {
1933          ice->state.dirty |= IRIS_DIRTY_DS_WRITE_ENABLE;
1934          ice->state.ds_write_state = new_cso->ds_write_state;
1935       }
1936 
1937 #if GFX_VER >= 12
1938       if (cso_changed(depth_bounds))
1939          ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
1940 #endif
1941    }
1942 
1943    ice->state.cso_zsa = new_cso;
1944    ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
1945    ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
1946    ice->state.stage_dirty |=
1947       ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
1948 
1949    if (GFX_VER == 8)
1950       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1951 }
1952 
1953 #if GFX_VER == 8
1954 static bool
1955 want_pma_fix(struct iris_context *ice)
1956 {
1957    UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
1958    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
1959    const struct iris_fs_data *fs_data =
1960       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
1961    const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
1962    const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
1963    const struct iris_blend_state *cso_blend = ice->state.cso_blend;
1964 
1965    /* In very specific combinations of state, we can instruct Gfx8-9 hardware
1966     * to avoid stalling at the pixel mask array.  The state equations are
1967     * documented in these places:
1968     *
1969     * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
1970     * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
1971     *
1972     * Both equations share some common elements:
1973     *
1974     *    no_hiz_op =
1975     *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1976     *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1977     *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1978     *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
1979     *
1980     *    killpixels =
1981     *       3DSTATE_WM::ForceKillPix != ForceOff &&
1982     *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1983     *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1984     *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1985     *        3DSTATE_PS_BLEND::AlphaTestEnable ||
1986     *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1987     *
1988     *    (Technically the stencil PMA treats ForceKillPix differently,
1989     *     but I think this is a documentation oversight, and we don't
1990     *     ever use it in this way, so it doesn't matter).
1991     *
1992     *    common_pma_fix =
1993     *       3DSTATE_WM::ForceThreadDispatch != 1 &&
1994     *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
1995     *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1996     *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1997     *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
1998     *       3DSTATE_PS_EXTRA::PixelShaderValid &&
1999     *       no_hiz_op
2000     *
2001     * These are always true:
2002     *
2003     *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
2004     *    3DSTATE_PS_EXTRA::PixelShaderValid
2005     *
2006     * Also, we never use the normal drawing path for HiZ ops; these are true:
2007     *
2008     *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2009     *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2010     *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2011     *      3DSTATE_WM_HZ_OP::StencilBufferClear)
2012     *
2013     * This happens sometimes:
2014     *
2015     *    3DSTATE_WM::ForceThreadDispatch != 1
2016     *
2017     * However, we choose to ignore it as it either agrees with the signal
2018     * (dispatch was already enabled, so nothing out of the ordinary), or
2019     * there are no framebuffer attachments (so no depth or HiZ anyway,
2020     * meaning the PMA signal will already be disabled).
2021     */
2022 
2023    if (!cso_fb->zsbuf)
2024       return false;
2025 
2026    struct iris_resource *zres, *sres;
2027    iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
2028 
2029    /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2030     * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2031     */
2032    if (!zres ||
2033        !iris_resource_level_has_hiz(devinfo, zres, cso_fb->zsbuf->u.tex.level))
2034       return false;
2035 
2036    /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
2037    if (fs_data->early_fragment_tests)
2038       return false;
2039 
2040    /* 3DSTATE_WM::ForceKillPix != ForceOff &&
2041     * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2042     *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2043     *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2044     *  3DSTATE_PS_BLEND::AlphaTestEnable ||
2045     *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2046     */
2047    bool killpixels = fs_data->uses_kill || fs_data->uses_omask ||
2048                      cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;
2049 
2050    /* The Gfx8 depth PMA equation becomes:
2051     *
2052     *    depth_writes =
2053     *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
2054     *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
2055     *
2056     *    stencil_writes =
2057     *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
2058     *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
2059     *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
2060     *
2061     *    Z_PMA_OPT =
2062     *       common_pma_fix &&
2063     *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
2064     *       ((killpixels && (depth_writes || stencil_writes)) ||
2065     *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
2066     *
2067     */
2068    if (!cso_zsa->depth_test_enabled)
2069       return false;
2070 
2071    return fs_data->computed_depth_mode != PSCDEPTH_OFF ||
2072           (killpixels && (cso_zsa->depth_writes_enabled ||
2073                           (sres && cso_zsa->stencil_writes_enabled)));
2074 }
2075 #endif
2076 
2077 void
2078 genX(update_pma_fix)(struct iris_context *ice,
2079                      struct iris_batch *batch,
2080                      bool enable)
2081 {
2082 #if GFX_VER == 8
2083    struct iris_genx_state *genx = ice->state.genx;
2084 
2085    if (genx->pma_fix_enabled == enable)
2086       return;
2087 
2088    genx->pma_fix_enabled = enable;
2089 
2090    /* According to the Broadwell PIPE_CONTROL documentation, software should
2091     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2092     * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
2093     *
2094     * The Gfx9 docs say to use a depth stall rather than a command streamer
2095     * stall.  However, the hardware seems to violently disagree.  A full
2096     * command streamer stall seems to be needed in both cases.
2097     */
2098    iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2099                                 PIPE_CONTROL_CS_STALL |
2100                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2101                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2102 
2103    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
2104       reg.NPPMAFixEnable = enable;
2105       reg.NPEarlyZFailsDisable = enable;
2106       reg.NPPMAFixEnableMask = true;
2107       reg.NPEarlyZFailsDisableMask = true;
2108    }
2109 
2110    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2111     * Flush bits is often necessary.  We do it regardless because it's easier.
2112     * The render cache flush is also necessary if stencil writes are enabled.
2113     *
2114     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
2115     * flushes seem to work just as well.
2116     */
2117    iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2118                                 PIPE_CONTROL_DEPTH_STALL |
2119                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2120                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2121 #endif
2122 }
2123 
2124 /**
2125  * Gallium CSO for rasterizer state.
2126  */
2127 struct iris_rasterizer_state {
2128    uint32_t sf[GENX(3DSTATE_SF_length)];
2129    uint32_t clip[GENX(3DSTATE_CLIP_length)];
2130    uint32_t raster[GENX(3DSTATE_RASTER_length)];
2131    uint32_t wm[GENX(3DSTATE_WM_length)];
2132    uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
2133 
2134    uint8_t num_clip_plane_consts;
2135    bool clip_halfz; /* for CC_VIEWPORT */
2136    bool depth_clip_near; /* for CC_VIEWPORT */
2137    bool depth_clip_far; /* for CC_VIEWPORT */
2138    bool flatshade; /* for shader state */
2139    bool flatshade_first; /* for stream output */
2140    bool clamp_fragment_color; /* for shader state */
2141    bool light_twoside; /* for shader state */
2142    bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
2143    bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
2144    bool line_smooth;
2145    bool line_stipple_enable;
2146    bool poly_stipple_enable;
2147    bool multisample;
2148    bool force_persample_interp;
2149    bool conservative_rasterization;
2150    bool fill_mode_point;
2151    bool fill_mode_line;
2152    bool fill_mode_point_or_line;
2153    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
2154    uint16_t sprite_coord_enable;
2155 };
2156 
2157 static float
2158 get_line_width(const struct pipe_rasterizer_state *state)
2159 {
2160    float line_width = state->line_width;
2161 
2162    /* From the OpenGL 4.4 spec:
2163     *
2164     * "The actual width of non-antialiased lines is determined by rounding
2165     *  the supplied width to the nearest integer, then clamping it to the
2166     *  implementation-dependent maximum non-antialiased line width."
2167     */
2168    if (!state->multisample && !state->line_smooth)
2169       line_width = roundf(state->line_width);
2170 
2171    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
2172       /* For 1 pixel line thickness or less, the general anti-aliasing
2173        * algorithm gives up, and a garbage line is generated.  Setting a
2174        * Line Width of 0.0 specifies the rasterization of the "thinnest"
2175        * (one-pixel-wide), non-antialiased lines.
2176        *
2177        * Lines rendered with zero Line Width are rasterized using the
2178        * "Grid Intersection Quantization" rules as specified by the
2179        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
2180        */
2181       line_width = 0.0f;
2182    }
2183 
2184    return line_width;
2185 }
2186 
2187 /**
2188  * The pipe->create_rasterizer_state() driver hook.
2189  */
2190 static void *
2191 iris_create_rasterizer_state(struct pipe_context *ctx,
2192                              const struct pipe_rasterizer_state *state)
2193 {
2194    struct iris_rasterizer_state *cso =
2195       malloc(sizeof(struct iris_rasterizer_state));
2196 
2197    cso->multisample = state->multisample;
2198    cso->force_persample_interp = state->force_persample_interp;
2199    cso->clip_halfz = state->clip_halfz;
2200    cso->depth_clip_near = state->depth_clip_near;
2201    cso->depth_clip_far = state->depth_clip_far;
2202    cso->flatshade = state->flatshade;
2203    cso->flatshade_first = state->flatshade_first;
2204    cso->clamp_fragment_color = state->clamp_fragment_color;
2205    cso->light_twoside = state->light_twoside;
2206    cso->rasterizer_discard = state->rasterizer_discard;
2207    cso->half_pixel_center = state->half_pixel_center;
2208    cso->sprite_coord_mode = state->sprite_coord_mode;
2209    cso->sprite_coord_enable = state->sprite_coord_enable;
2210    cso->line_smooth = state->line_smooth;
2211    cso->line_stipple_enable = state->line_stipple_enable;
2212    cso->poly_stipple_enable = state->poly_stipple_enable;
2213    cso->conservative_rasterization =
2214       state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
2215 
2216    cso->fill_mode_point =
2217       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2218       state->fill_back == PIPE_POLYGON_MODE_POINT;
2219    cso->fill_mode_line =
2220       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2221       state->fill_back == PIPE_POLYGON_MODE_LINE;
2222    cso->fill_mode_point_or_line =
2223       cso->fill_mode_point ||
2224       cso->fill_mode_line;
2225 
2226    if (state->clip_plane_enable != 0)
2227       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2228    else
2229       cso->num_clip_plane_consts = 0;
2230 
2231    float line_width = get_line_width(state);
2232 
2233    iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2234       sf.StatisticsEnable = true;
2235       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2236       sf.LineEndCapAntialiasingRegionWidth =
2237          state->line_smooth ? _10pixels : _05pixels;
2238       sf.LastPixelEnable = state->line_last_pixel;
2239       sf.LineWidth = line_width;
2240       sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
2241                              !state->point_quad_rasterization;
2242       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2243       sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
2244 
2245       if (state->flatshade_first) {
2246          sf.TriangleFanProvokingVertexSelect = 1;
2247       } else {
2248          sf.TriangleStripListProvokingVertexSelect = 2;
2249          sf.TriangleFanProvokingVertexSelect = 2;
2250          sf.LineStripListProvokingVertexSelect = 1;
2251       }
2252    }
2253 
2254    iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2255       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2256       rr.CullMode = translate_cull_mode(state->cull_face);
2257       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2258       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2259       rr.DXMultisampleRasterizationEnable = state->multisample;
2260       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2261       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2262       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2263       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2264       rr.GlobalDepthOffsetScale = state->offset_scale;
2265       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2266       rr.SmoothPointEnable = state->point_smooth;
2267       rr.ScissorRectangleEnable = state->scissor;
2268 #if GFX_VER >= 9
2269       rr.ViewportZNearClipTestEnable = state->depth_clip_near;
2270       rr.ViewportZFarClipTestEnable = state->depth_clip_far;
2271       rr.ConservativeRasterizationEnable =
2272          cso->conservative_rasterization;
2273 #else
2274       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2275 #endif
2276    }
2277 
2278    iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2279       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2280        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2281        */
2282       cl.EarlyCullEnable = true;
2283       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2284       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2285       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2286       cl.GuardbandClipTestEnable = true;
2287       cl.ClipEnable = true;
2288       cl.MinimumPointWidth = 0.125;
2289       cl.MaximumPointWidth = 255.875;
2290 
2291       if (state->flatshade_first) {
2292          cl.TriangleFanProvokingVertexSelect = 1;
2293       } else {
2294          cl.TriangleStripListProvokingVertexSelect = 2;
2295          cl.TriangleFanProvokingVertexSelect = 2;
2296          cl.LineStripListProvokingVertexSelect = 1;
2297       }
2298    }
2299 
2300    iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
2301       /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
2302        * filled in at draw time from the FS program.
2303        */
2304       wm.LineAntialiasingRegionWidth = _10pixels;
2305       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2306       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2307       wm.LineStippleEnable = state->line_stipple_enable;
2308       wm.PolygonStippleEnable = state->poly_stipple_enable;
2309    }
2310 
2311    /* Remap from 0..255 back to 1..256 */
2312    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2313 
2314    iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2315       if (state->line_stipple_enable) {
2316          line.LineStipplePattern = state->line_stipple_pattern;
2317          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2318          line.LineStippleRepeatCount = line_stipple_factor;
2319       }
2320    }
2321 
2322    return cso;
2323 }
2324 
2325 /**
2326  * The pipe->bind_rasterizer_state() driver hook.
2327  *
2328  * Bind a rasterizer CSO and flag related dirty bits.
2329  */
2330 static void
2331 iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2332 {
2333    struct iris_context *ice = (struct iris_context *) ctx;
2334    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
2335    struct iris_rasterizer_state *new_cso = state;
2336 
2337    if (new_cso) {
2338       /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2339       if (cso_changed_memcmp(line_stipple))
2340          ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
2341 
2342       if (cso_changed(half_pixel_center))
2343          ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
2344 
2345       if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
2346          ice->state.dirty |= IRIS_DIRTY_WM;
2347 
2348       if (cso_changed(rasterizer_discard))
2349          ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
2350 
2351       if (cso_changed(flatshade_first))
2352          ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
2353 
2354       if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
2355           cso_changed(clip_halfz))
2356          ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2357 
2358       if (cso_changed(sprite_coord_enable) ||
2359           cso_changed(sprite_coord_mode) ||
2360           cso_changed(light_twoside))
2361          ice->state.dirty |= IRIS_DIRTY_SBE;
2362 
2363       if (cso_changed(conservative_rasterization))
2364          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
2365    }
2366 
2367    ice->state.cso_rast = new_cso;
2368    ice->state.dirty |= IRIS_DIRTY_RASTER;
2369    ice->state.dirty |= IRIS_DIRTY_CLIP;
2370    ice->state.stage_dirty |=
2371       ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
2372 }
2373 
2374 /**
2375  * Return true if the given wrap mode requires the border color to exist.
2376  *
2377  * (We can skip uploading it if the sampler isn't going to use it.)
2378  */
2379 static bool
2380 wrap_mode_needs_border_color(unsigned wrap_mode)
2381 {
2382    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2383 }
2384 
2385 /**
2386  * Gallium CSO for sampler state.
2387  */
2388 struct iris_sampler_state {
2389    union pipe_color_union border_color;
2390    bool needs_border_color;
2391 
2392    uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
2393 
2394 #if GFX_VERx10 == 125
2395    /* Sampler state structure to use for 3D textures in order to
2396     * implement Wa_14014414195.
2397     */
2398    uint32_t sampler_state_3d[GENX(SAMPLER_STATE_length)];
2399 #endif
2400 };
2401 
2402 static void
2403 fill_sampler_state(uint32_t *sampler_state,
2404                    const struct pipe_sampler_state *state,
2405                    unsigned max_anisotropy)
2406 {
2407    float min_lod = state->min_lod;
2408    unsigned mag_img_filter = state->mag_img_filter;
2409 
2410    // XXX: explain this code ported from ilo...I don't get it at all...
2411    if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2412        state->min_lod > 0.0f) {
2413       min_lod = 0.0f;
2414       mag_img_filter = state->min_img_filter;
2415    }
2416 
2417    iris_pack_state(GENX(SAMPLER_STATE), sampler_state, samp) {
2418       samp.TCXAddressControlMode = translate_wrap(state->wrap_s);
2419       samp.TCYAddressControlMode = translate_wrap(state->wrap_t);
2420       samp.TCZAddressControlMode = translate_wrap(state->wrap_r);
2421       samp.CubeSurfaceControlMode = state->seamless_cube_map;
2422       samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2423       samp.MinModeFilter = state->min_img_filter;
2424       samp.MagModeFilter = mag_img_filter;
2425       samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2426       samp.MaximumAnisotropy = RATIO21;
2427 
2428       if (max_anisotropy >= 2) {
2429          if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2430             samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2431             samp.AnisotropicAlgorithm = EWAApproximation;
2432          }
2433 
2434          if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
2435             samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2436 
2437          samp.MaximumAnisotropy =
2438             MIN2((max_anisotropy - 2) / 2, RATIO161);
2439       }
2440 
2441       /* Set address rounding bits if not using nearest filtering. */
2442       if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2443          samp.UAddressMinFilterRoundingEnable = true;
2444          samp.VAddressMinFilterRoundingEnable = true;
2445          samp.RAddressMinFilterRoundingEnable = true;
2446       }
2447 
2448       if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2449          samp.UAddressMagFilterRoundingEnable = true;
2450          samp.VAddressMagFilterRoundingEnable = true;
2451          samp.RAddressMagFilterRoundingEnable = true;
2452       }
2453 
2454       if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2455          samp.ShadowFunction = translate_shadow_func(state->compare_func);
2456 
2457       const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2458 
2459       samp.LODPreClampMode = CLAMP_MODE_OGL;
2460       samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2461       samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2462       samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2463 
2464       /* .BorderColorPointer is filled in by iris_bind_sampler_states. */
2465    }
2466 }
2467 
2468 /**
2469  * The pipe->create_sampler_state() driver hook.
2470  *
2471  * We fill out SAMPLER_STATE (except for the border color pointer), and
2472  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2473  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2474  * all bound sampler states to be in contiguous memory.
2475  */
2476 static void *
2477 iris_create_sampler_state(struct pipe_context *ctx,
2478                           const struct pipe_sampler_state *state)
2479 {
2480    UNUSED struct iris_screen *screen = (void *)ctx->screen;
2481    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2482    struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
2483 
2484    if (!cso)
2485       return NULL;
2486 
2487    STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2488    STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2489 
2490    unsigned wrap_s = translate_wrap(state->wrap_s);
2491    unsigned wrap_t = translate_wrap(state->wrap_t);
2492    unsigned wrap_r = translate_wrap(state->wrap_r);
2493 
2494    memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2495 
2496    cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2497                              wrap_mode_needs_border_color(wrap_t) ||
2498                              wrap_mode_needs_border_color(wrap_r);
2499 
2500    fill_sampler_state(cso->sampler_state, state, state->max_anisotropy);
2501 
2502 #if GFX_VERx10 == 125
2503    /* Fill an extra sampler state structure with anisotropic filtering
2504     * disabled, which is used to implement Wa_14014414195.
2505     */
2506    if (intel_needs_workaround(screen->devinfo, 14014414195))
2507       fill_sampler_state(cso->sampler_state_3d, state, 0);
2508 #endif
2509 
2510    return cso;
2511 }
2512 
2513 /**
2514  * The pipe->bind_sampler_states() driver hook.
2515  */
2516 static void
2517 iris_bind_sampler_states(struct pipe_context *ctx,
2518                          enum pipe_shader_type p_stage,
2519                          unsigned start, unsigned count,
2520                          void **states)
2521 {
2522    struct iris_context *ice = (struct iris_context *) ctx;
2523    gl_shader_stage stage = stage_from_pipe(p_stage);
2524    struct iris_shader_state *shs = &ice->state.shaders[stage];
2525 
2526    assert(start + count <= IRIS_MAX_SAMPLERS);
2527 
2528    bool dirty = false;
2529 
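   /* A NULL states array unbinds every sampler in [start, start + count). */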
2530    for (int i = 0; i < count; i++) {
2531       struct iris_sampler_state *state = states ? states[i] : NULL;
2532       if (shs->samplers[start + i] != state) {
2533          shs->samplers[start + i] = state;
2534          dirty = true;
2535       }
2536    }
2537 
2538    if (dirty)
2539       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2540 }
2541 
2542 /**
2543  * Upload the sampler states into a contiguous area of GPU memory,
2544  * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2545  *
2546  * Also fill out the border color state pointers.
2547  */
2548 static void
2549 iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
2550 {
2551    struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen;
2552    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
2553    struct iris_shader_state *shs = &ice->state.shaders[stage];
2554    struct iris_border_color_pool *border_color_pool =
2555       iris_bufmgr_get_border_color_pool(screen->bufmgr);
2556 
2557    /* We assume gallium frontends will call pipe->bind_sampler_states()
2558     * if the program's number of textures changes.
2559     */
2560    unsigned count = util_last_bit64(shader->bt.samplers_used_mask);
2561 
2562    if (!count)
2563       return;
2564 
2565    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2566     * in the dynamic state memory zone, so we can point to it via the
2567     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2568     */
2569    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2570    uint32_t *map =
2571       upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
2572    if (unlikely(!map))
2573       return;
2574 
2575    struct pipe_resource *res = shs->sampler_table.res;
2576    struct iris_bo *bo = iris_resource_bo(res);
2577 
2578    iris_record_state_size(ice->state.sizes,
2579                           bo->address + shs->sampler_table.offset, size);
2580 
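   /* 3DSTATE_SAMPLER_STATE_POINTERS_* take offsets relative to Dynamic State
    * Base Address, so fold in the BO's offset from that base.
    */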
2581    shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);
2582 
2583    ice->state.need_border_colors &= ~(1 << stage);
2584 
2585    for (int i = 0; i < count; i++) {
2586       struct iris_sampler_state *state = shs->samplers[i];
2587       struct iris_sampler_view *tex = shs->textures[i];
2588 
2589       if (!state) {
2590          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2591       } else {
2592          const uint32_t *sampler_state = state->sampler_state;
2593 
2594 #if GFX_VERx10 == 125
2595          if (intel_needs_workaround(screen->devinfo, 14014414195) &&
2596              tex && tex->res->base.b.target == PIPE_TEXTURE_3D) {
2597                sampler_state = state->sampler_state_3d;
2598          }
2599 #endif
2600 
2601          if (!state->needs_border_color) {
2602             memcpy(map, sampler_state, 4 * GENX(SAMPLER_STATE_length));
2603          } else {
2604             ice->state.need_border_colors |= 1 << stage;
2605 
2606             /* We may need to swizzle the border color for format faking.
2607              * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2608              * This means we need to move the border color's A channel into
2609              * the R or G channels so that those read swizzles will move it
2610              * back into A.
2611              */
2612             union pipe_color_union *color = &state->border_color;
2613             union pipe_color_union tmp;
2614             if (tex) {
2615                enum pipe_format internal_format = tex->res->internal_format;
2616 
2617                if (util_format_is_alpha(internal_format)) {
2618                   unsigned char swz[4] = {
2619                      PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
2620                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2621                   };
2622                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2623                   color = &tmp;
2624                } else if (util_format_is_luminance_alpha(internal_format) &&
2625                           internal_format != PIPE_FORMAT_L8A8_SRGB) {
2626                   unsigned char swz[4] = {
2627                      PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
2628                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2629                   };
2630                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2631                   color = &tmp;
2632                }
2633             }
2634 
2635             /* Stream out the border color and merge the pointer. */
2636             uint32_t offset = iris_upload_border_color(border_color_pool,
2637                                                        color);
2638 
2639             uint32_t dynamic[GENX(SAMPLER_STATE_length)];
2640             iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
2641                dyns.BorderColorPointer = offset;
2642             }
2643 
2644             for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
2645                map[j] = sampler_state[j] | dynamic[j];
2646          }
2647       }
2648 
2649       map += GENX(SAMPLER_STATE_length);
2650    }
2651 }
2652 
2653 static enum isl_channel_select
2654 fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2655 {
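   /* Compose the sampler view's channel selection with the format's own
    * swizzle, so reads from "faked" formats (e.g. alpha-only stored as red)
    * still fetch the channel that actually holds the data.
    */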
2656    switch (swz) {
2657    case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2658    case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2659    case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2660    case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2661    case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2662    case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2663    default: unreachable("invalid swizzle");
2664    }
2665 }
2666 
2667 static void
2668 fill_buffer_surface_state(struct isl_device *isl_dev,
2669                           struct iris_resource *res,
2670                           void *map,
2671                           enum isl_format format,
2672                           struct isl_swizzle swizzle,
2673                           unsigned offset,
2674                           unsigned size,
2675                           isl_surf_usage_flags_t usage)
2676 {
2677    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2678    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2679 
2680    /* The ARB_texture_buffer_specification says:
2681     *
2682     *    "The number of texels in the buffer texture's texel array is given by
2683     *
2684     *       floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2685     *
2686     *     where <buffer_size> is the size of the buffer object, in basic
2687     *     machine units and <components> and <base_type> are the element count
2688     *     and base data type for elements, as specified in Table X.1.  The
2689     *     number of texels in the texel array is then clamped to the
2690     *     implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2691     *
2692     * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2693     * so that when ISL divides by stride to obtain the number of texels, that
2694     * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2695     */
2696    unsigned final_size =
2697       MIN3(size, res->bo->size - res->offset - offset,
2698            IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
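   /* Illustrative example (hypothetical values, not from any particular
    * target): for an R32G32B32A32_FLOAT view, cpp is 16, so the clamp above
    * caps final_size at no more than IRIS_MAX_TEXTURE_BUFFER_SIZE * 16
    * bytes; when ISL later divides size_B by stride_B (also 16), the
    * resulting texel count cannot exceed IRIS_MAX_TEXTURE_BUFFER_SIZE.
    */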
2699 
2700    isl_buffer_fill_state(isl_dev, map,
2701                          .address = res->bo->address + res->offset + offset,
2702                          .size_B = final_size,
2703                          .format = format,
2704                          .swizzle = swizzle,
2705                          .stride_B = cpp,
2706                          .mocs = iris_mocs(res->bo, isl_dev, usage));
2707 }
2708 
2709 #define SURFACE_STATE_ALIGNMENT 64
2710 
2711 /**
2712  * Allocate several contiguous SURFACE_STATE structures, one for each
2713  * supported auxiliary surface mode.  This only allocates the CPU-side
2714  * copies; they still need to be uploaded to the GPU once filled in.
2715  */
2716 static void
2717 alloc_surface_states(struct iris_surface_state *surf_state,
2718                      unsigned aux_usages)
2719 {
2720    enum { surf_size = 4 * GENX(RENDER_SURFACE_STATE_length) };
2721 
2722    /* If this changes, update this to explicitly align pointers */
2723    STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2724 
2725    assert(aux_usages != 0);
2726 
2727    /* In case we're re-allocating them... */
2728    free(surf_state->cpu);
2729 
2730    surf_state->aux_usages = aux_usages;
2731    surf_state->num_states = util_bitcount(aux_usages);
2732    surf_state->cpu = calloc(surf_state->num_states, surf_size);
2733    surf_state->ref.offset = 0;
2734    pipe_resource_reference(&surf_state->ref.res, NULL);
2735 
2736    assert(surf_state->cpu);
2737 }
2738 
2739 /**
2740  * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2741  */
2742 static void
2743 upload_surface_states(struct u_upload_mgr *mgr,
2744                       struct iris_surface_state *surf_state)
2745 {
2746    const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2747    const unsigned bytes = surf_state->num_states * surf_size;
2748 
2749    void *map =
2750       upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2751 
2752    surf_state->ref.offset +=
2753       iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2754 
2755    if (map)
2756       memcpy(map, surf_state->cpu, bytes);
2757 }
2758 
2759 /**
2760  * Update resource addresses in a set of SURFACE_STATE descriptors,
2761  * and re-upload them if necessary.
2762  */
2763 static bool
2764 update_surface_state_addrs(struct u_upload_mgr *mgr,
2765                            struct iris_surface_state *surf_state,
2766                            struct iris_bo *bo)
2767 {
2768    if (surf_state->bo_address == bo->address)
2769       return false;
2770 
2771    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2772    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2773 
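   /* surf_state->cpu is stored as uint32_t DWords, so dividing the bit
    * offset of Surface Base Address by 32 gives its DWord index; the cast
    * then lets us rewrite the whole 64-bit address in one go.
    */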
2774    uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2775 
2776    /* First, update the CPU copies.  We assume no other fields exist in
2777     * the QWord containing Surface Base Address.
2778     */
2779    for (unsigned i = 0; i < surf_state->num_states; i++) {
2780       *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2781       ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2782    }
2783 
2784    /* Next, upload the updated copies to a GPU buffer. */
2785    upload_surface_states(mgr, surf_state);
2786 
2787    surf_state->bo_address = bo->address;
2788 
2789    return true;
2790 }
2791 
2792 /* Use this function only when surf needs to be filled out with the
2793  * information provided by the pipe_(image|sampler)_view.  This is only
2794  * necessary for the CL extension cl_khr_image2d_from_buffer, which is
2795  * why ISL_SURF_DIM_2D is hardcoded in the dim field.
2796  */
2797 static void
2798 fill_surf_for_tex2d_from_buffer(struct isl_device *isl_dev,
2799                                 enum isl_format format,
2800                                 unsigned width,
2801                                 unsigned height,
2802                                 unsigned row_stride,
2803                                 isl_surf_usage_flags_t usage,
2804                                 struct isl_surf *surf)
2805 {
2806    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2807    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2808 
2809    const struct isl_surf_init_info init_info = {
2810       .dim = ISL_SURF_DIM_2D,
2811       .format = format,
2812       .width = width,
2813       .height = height,
2814       .depth = 1,
2815       .levels = 1,
2816       .array_len = 1,
2817       .samples = 1,
2818       .min_alignment_B = 4,
2819       .row_pitch_B = row_stride * cpp,
2820       .usage = usage,
2821       .tiling_flags = ISL_TILING_LINEAR_BIT,
2822    };
2823 
2824    const bool isl_surf_created_successfully =
2825       isl_surf_init_s(isl_dev, surf, &init_info);
2826 
2827    assert(isl_surf_created_successfully);
2828 }
2829 
2830 static void
2831 fill_surface_state(struct isl_device *isl_dev,
2832                    void *map,
2833                    struct iris_resource *res,
2834                    struct isl_surf *surf,
2835                    struct isl_view *view,
2836                    unsigned aux_usage,
2837                    uint32_t extra_main_offset,
2838                    uint32_t tile_x_sa,
2839                    uint32_t tile_y_sa)
2840 {
2841    struct isl_surf_fill_state_info f = {
2842       .surf = surf,
2843       .view = view,
2844       .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2845       .address = res->bo->address + res->offset + extra_main_offset,
2846       .x_offset_sa = tile_x_sa,
2847       .y_offset_sa = tile_y_sa,
2848    };
2849 
2850    if (aux_usage != ISL_AUX_USAGE_NONE) {
2851       f.aux_surf = &res->aux.surf;
2852       f.aux_usage = aux_usage;
2853       f.clear_color = res->aux.clear_color;
2854 
2855       if (aux_usage == ISL_AUX_USAGE_MC)
2856          f.mc_format = iris_format_for_usage(isl_dev->info,
2857                                              res->external_format,
2858                                              surf->usage).fmt;
2859 
2860       if (res->aux.bo)
2861          f.aux_address = res->aux.bo->address + res->aux.offset;
2862 
2863       if (res->aux.clear_color_bo) {
2864          f.clear_address = res->aux.clear_color_bo->address +
2865                            res->aux.clear_color_offset;
2866          f.use_clear_address = isl_dev->info->ver > 9;
2867       }
2868    }
2869 
2870    isl_surf_fill_state_s(isl_dev, map, &f);
2871 }
2872 
2873 static void
2874 fill_surface_states(struct isl_device *isl_dev,
2875                     struct iris_surface_state *surf_state,
2876                     struct iris_resource *res,
2877                     struct isl_surf *surf,
2878                     struct isl_view *view,
2879                     uint64_t extra_main_offset,
2880                     uint32_t tile_x_sa,
2881                     uint32_t tile_y_sa)
2882 {
2883    void *map = surf_state->cpu;
2884    unsigned aux_modes = surf_state->aux_usages;
2885 
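   /* Write one SURFACE_STATE per enabled aux usage, in ascending bit order,
    * each copy SURFACE_STATE_ALIGNMENT bytes apart in the CPU-side buffer.
    */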
2886    while (aux_modes) {
2887       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2888 
2889       fill_surface_state(isl_dev, map, res, surf, view, aux_usage,
2890                          extra_main_offset, tile_x_sa, tile_y_sa);
2891 
2892       map += SURFACE_STATE_ALIGNMENT;
2893    }
2894 }
2895 
2896 /**
2897  * The pipe->create_sampler_view() driver hook.
2898  */
2899 static struct pipe_sampler_view *
2900 iris_create_sampler_view(struct pipe_context *ctx,
2901                          struct pipe_resource *tex,
2902                          const struct pipe_sampler_view *tmpl)
2903 {
2904    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2905    const struct intel_device_info *devinfo = screen->devinfo;
2906    struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
2907 
2908    if (!isv)
2909       return NULL;
2910 
2911    /* initialize base object */
2912    isv->base = *tmpl;
2913    isv->base.context = ctx;
2914    isv->base.texture = NULL;
2915    pipe_reference_init(&isv->base.reference, 1);
2916    pipe_resource_reference(&isv->base.texture, tex);
2917 
2918    if (util_format_is_depth_or_stencil(tmpl->format)) {
2919       struct iris_resource *zres, *sres;
2920       const struct util_format_description *desc =
2921          util_format_description(tmpl->format);
2922 
2923       iris_get_depth_stencil_resources(tex, &zres, &sres);
2924 
2925       tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2926    }
2927 
2928    isv->res = (struct iris_resource *) tex;
2929 
2930    isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2931 
2932    if (isv->base.target == PIPE_TEXTURE_CUBE ||
2933        isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2934       usage |= ISL_SURF_USAGE_CUBE_BIT;
2935 
2936    const struct iris_format_info fmt =
2937       iris_format_for_usage(devinfo, tmpl->format, usage);
2938 
2939    isv->clear_color = isv->res->aux.clear_color;
2940 
2941    isv->view = (struct isl_view) {
2942       .format = fmt.fmt,
2943       .swizzle = (struct isl_swizzle) {
2944          .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
2945          .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
2946          .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
2947          .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
2948       },
2949       .usage = usage,
2950    };
2951 
2952    unsigned aux_usages = 0;
2953 
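   /* Pick which auxiliary usages get their own SURFACE_STATE: fall back to
    * no aux when the view format can't be sampled with CCS_E, or when we
    * can't sample with the depth aux surface; otherwise include both the
    * no-aux case and the resource's aux usage.
    */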
2954    if ((isv->res->aux.usage == ISL_AUX_USAGE_CCS_D ||
2955         isv->res->aux.usage == ISL_AUX_USAGE_CCS_E ||
2956         isv->res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
2957        !isl_format_supports_ccs_e(devinfo, isv->view.format)) {
2958       aux_usages = 1 << ISL_AUX_USAGE_NONE;
2959    } else if (isl_aux_usage_has_hiz(isv->res->aux.usage) &&
2960               !iris_sample_with_depth_aux(devinfo, isv->res)) {
2961       aux_usages = 1 << ISL_AUX_USAGE_NONE;
2962    } else {
2963       aux_usages = 1 << ISL_AUX_USAGE_NONE |
2964                    1 << isv->res->aux.usage;
2965    }
2966 
2967    alloc_surface_states(&isv->surface_state, aux_usages);
2968    isv->surface_state.bo_address = isv->res->bo->address;
2969 
2970    /* Fill out SURFACE_STATE for this view. */
2971    if (tmpl->target != PIPE_BUFFER) {
2972       isv->view.base_level = tmpl->u.tex.first_level;
2973       isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2974 
2975       if (tmpl->target == PIPE_TEXTURE_3D) {
2976          isv->view.base_array_layer = 0;
2977          isv->view.array_len = 1;
2978       } else {
2979 #if GFX_VER < 9
2980          /* Hardware older than Skylake ignores this value */
2981          assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
2982 #endif
2983          isv->view.base_array_layer = tmpl->u.tex.first_layer;
2984          isv->view.array_len =
2985             tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2986       }
2987 
2988       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
2989                           &isv->res->surf, &isv->view, 0, 0, 0);
2990    } else if (isv->base.is_tex2d_from_buf) {
2991       /* If this is a 2D image created from a buffer, use
2992        * fill_surface_states() with the image parameters provided by the
2993        * CL application.
2994        */
2995       isv->view.base_array_layer = 0;
2996       isv->view.array_len = 1;
2997 
2998       /* Create temp_surf and fill with values provided by CL application */
2999       struct isl_surf temp_surf;
3000       fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt.fmt,
3001                                       isv->base.u.tex2d_from_buf.width,
3002                                       isv->base.u.tex2d_from_buf.height,
3003                                       isv->base.u.tex2d_from_buf.row_stride,
3004                                       usage,
3005                                       &temp_surf);
3006 
3007       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3008                           &temp_surf, &isv->view, 0, 0, 0);
3009    } else {
3010       fill_buffer_surface_state(&screen->isl_dev, isv->res,
3011                                 isv->surface_state.cpu,
3012                                 isv->view.format, isv->view.swizzle,
3013                                 tmpl->u.buf.offset, tmpl->u.buf.size,
3014                                 ISL_SURF_USAGE_TEXTURE_BIT);
3015    }
3016 
3017    return &isv->base;
3018 }
3019 
3020 static void
3021 iris_sampler_view_destroy(struct pipe_context *ctx,
3022                           struct pipe_sampler_view *state)
3023 {
3024    struct iris_sampler_view *isv = (void *) state;
3025    pipe_resource_reference(&state->texture, NULL);
3026    pipe_resource_reference(&isv->surface_state.ref.res, NULL);
3027    free(isv->surface_state.cpu);
3028    free(isv);
3029 }
3030 
3031 /**
3032  * The pipe->create_surface() driver hook.
3033  *
3034  * In Gallium nomenclature, "surfaces" are a view of a resource that
3035  * can be bound as a render target or depth/stencil buffer.
3036  */
3037 static struct pipe_surface *
3038 iris_create_surface(struct pipe_context *ctx,
3039                     struct pipe_resource *tex,
3040                     const struct pipe_surface *tmpl)
3041 {
3042    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3043    const struct intel_device_info *devinfo = screen->devinfo;
3044 
3045    isl_surf_usage_flags_t usage = 0;
3046    if (tmpl->writable)
3047       usage = ISL_SURF_USAGE_STORAGE_BIT;
3048    else if (util_format_is_depth_or_stencil(tmpl->format))
3049       usage = ISL_SURF_USAGE_DEPTH_BIT;
3050    else
3051       usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
3052 
3053    const struct iris_format_info fmt =
3054       iris_format_for_usage(devinfo, tmpl->format, usage);
3055 
3056    if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
3057        !isl_format_supports_rendering(devinfo, fmt.fmt)) {
3058       /* Framebuffer validation will reject this invalid case, but it
3059        * hasn't had the opportunity yet.  In the meantime, we need to
3060        * avoid hitting ISL asserts about unsupported formats below.
3061        */
3062       return NULL;
3063    }
3064 
3065    struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
3066    struct iris_resource *res = (struct iris_resource *) tex;
3067 
3068    if (!surf)
3069       return NULL;
3070 
3071    uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3072 
3073    struct isl_view *view = &surf->view;
3074    *view = (struct isl_view) {
3075       .format = fmt.fmt,
3076       .base_level = tmpl->u.tex.level,
3077       .levels = 1,
3078       .base_array_layer = tmpl->u.tex.first_layer,
3079       .array_len = array_len,
3080       .swizzle = ISL_SWIZZLE_IDENTITY,
3081       .usage = usage,
3082    };
3083 
3084 #if GFX_VER == 8
3085    struct isl_view *read_view = &surf->read_view;
3086    *read_view = (struct isl_view) {
3087       .format = fmt.fmt,
3088       .base_level = tmpl->u.tex.level,
3089       .levels = 1,
3090       .base_array_layer = tmpl->u.tex.first_layer,
3091       .array_len = array_len,
3092       .swizzle = ISL_SWIZZLE_IDENTITY,
3093       .usage = ISL_SURF_USAGE_TEXTURE_BIT,
3094    };
3095 
3096    struct isl_surf read_surf = res->surf;
3097    uint64_t read_surf_offset_B = 0;
3098    uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
3099    if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
3100       /* The minimum array element field of the surface state structure is
3101        * ignored by the sampler unit for 3D textures on some hardware.  If the
3102        * render buffer is a single slice of a 3D texture, create a 2D texture
3103        * covering that slice.
3104        *
3105        * TODO: This only handles the case where we're rendering to a single
3106        * slice of an array texture.  If we have layered rendering combined
3107        * with non-coherent FB fetch and a non-zero base_array_layer, then
3108        * we're going to run into problems.
3109        *
3110        * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
3111        */
3112       isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
3113                               read_view->base_level,
3114                               0, read_view->base_array_layer,
3115                               &read_surf, &read_surf_offset_B,
3116                               &read_surf_tile_x_sa, &read_surf_tile_y_sa);
3117       read_view->base_level = 0;
3118       read_view->base_array_layer = 0;
3119       assert(read_view->array_len == 1);
3120    } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
3121       /* Convert 1D array textures to 2D arrays: shaders always provide the
3122        * array index in the Z component, which avoids recompiles when the
3123        * framebuffer's texture target changes.
3124        */
3125       assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
3126       read_surf.dim = ISL_SURF_DIM_2D;
3127    }
3128 #endif
3129 
3130    struct isl_surf isl_surf = res->surf;
3131    uint64_t offset_B = 0;
3132    uint32_t tile_x_el = 0, tile_y_el = 0;
3133    if (isl_format_is_compressed(res->surf.format)) {
3134       /* The resource has a compressed format, which is not renderable, but we
3135        * have a renderable view format.  We must be attempting to upload
3136        * blocks of compressed data via an uncompressed view.
3137        *
3138        * In this case, we can assume there are no auxiliary surfaces, a single
3139        * miplevel, and that the resource is single-sampled.  Gallium may try
3140        * and create an uncompressed view with multiple layers, however.
3141        */
3142       assert(res->aux.surf.size_B == 0);
3143       assert(res->surf.samples == 1);
3144       assert(view->levels == 1);
3145 
3146       bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev,
3147                                                &res->surf, view,
3148                                                &isl_surf, view, &offset_B,
3149                                                &tile_x_el, &tile_y_el);
3150 
3151       /* On Broadwell, HALIGN and VALIGN are specified in pixels and are
3152        * hard-coded to align to exactly the block size of the compressed
3153        * texture. This means that, when reinterpreted as a non-compressed
3154        * texture, the tile offsets may be anything.
3155        *
3156        * We need them to be multiples of 4 to be usable in RENDER_SURFACE_STATE,
3157        * so force the state tracker to take fallback paths if they're not.
3158        */
3159 #if GFX_VER == 8
3160       if (tile_x_el % 4 != 0 || tile_y_el % 4 != 0) {
3161          ok = false;
3162       }
3163 #endif
3164 
3165       if (!ok) {
3166          free(surf);
3167          return NULL;
3168       }
3169    }
3170 
3171    surf->clear_color = res->aux.clear_color;
3172 
3173    struct pipe_surface *psurf = &surf->base;
3174    pipe_reference_init(&psurf->reference, 1);
3175    pipe_resource_reference(&psurf->texture, tex);
3176    psurf->context = ctx;
3177    psurf->format = tmpl->format;
3178    psurf->width = isl_surf.logical_level0_px.width;
3179    psurf->height = isl_surf.logical_level0_px.height;
3180    psurf->texture = tex;
3181    psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
3182    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
3183    psurf->u.tex.level = tmpl->u.tex.level;
3184 
3185    /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
3186    if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
3187                           ISL_SURF_USAGE_STENCIL_BIT))
3188       return psurf;
3189 
3190    /* Fill out a SURFACE_STATE for each possible auxiliary surface mode and
3191     * return the pipe_surface.
3192     */
3193    unsigned aux_usages = 0;
3194 
3195    if ((res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3196         res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3197        !isl_format_supports_ccs_e(devinfo, view->format)) {
3198       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3199    } else {
3200       aux_usages = 1 << ISL_AUX_USAGE_NONE |
3201                    1 << res->aux.usage;
3202    }
3203 
3204    alloc_surface_states(&surf->surface_state, aux_usages);
3205    surf->surface_state.bo_address = res->bo->address;
3206    fill_surface_states(&screen->isl_dev, &surf->surface_state, res,
3207                        &isl_surf, view, offset_B, tile_x_el, tile_y_el);
3208 
3209 #if GFX_VER == 8
3210    alloc_surface_states(&surf->surface_state_read, aux_usages);
3211    surf->surface_state_read.bo_address = res->bo->address;
3212    fill_surface_states(&screen->isl_dev, &surf->surface_state_read, res,
3213                        &read_surf, read_view, read_surf_offset_B,
3214                        read_surf_tile_x_sa, read_surf_tile_y_sa);
3215 #endif
3216 
3217    return psurf;
3218 }
3219 
3220 #if GFX_VER < 9
3221 static void
3222 fill_default_image_param(struct isl_image_param *param)
3223 {
3224    memset(param, 0, sizeof(*param));
3225    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3226     * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3227     * detailed explanation of these parameters.
3228     */
3229    param->swizzling[0] = 0xff;
3230    param->swizzling[1] = 0xff;
3231 }
3232 
3233 static void
3234 fill_buffer_image_param(struct isl_image_param *param,
3235                         enum pipe_format pfmt,
3236                         unsigned size)
3237 {
3238    const unsigned cpp = util_format_get_blocksize(pfmt);
3239 
3240    fill_default_image_param(param);
3241    param->size[0] = size / cpp;
3242    param->stride[0] = cpp;
3243 }
3244 #else
3245 #define isl_surf_fill_image_param(x, ...)
3246 #define fill_default_image_param(x, ...)
3247 #define fill_buffer_image_param(x, ...)
3248 #endif
3249 
3250 /**
3251  * The pipe->set_shader_images() driver hook.
3252  */
3253 static void
3254 iris_set_shader_images(struct pipe_context *ctx,
3255                        enum pipe_shader_type p_stage,
3256                        unsigned start_slot, unsigned count,
3257                        unsigned unbind_num_trailing_slots,
3258                        const struct pipe_image_view *p_images)
3259 {
3260    struct iris_context *ice = (struct iris_context *) ctx;
3261    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3262    gl_shader_stage stage = stage_from_pipe(p_stage);
3263    struct iris_shader_state *shs = &ice->state.shaders[stage];
3264 #if GFX_VER == 8
3265    struct iris_genx_state *genx = ice->state.genx;
3266    struct isl_image_param *image_params = genx->shaders[stage].image_param;
3267 #endif
3268 
3269    shs->bound_image_views &=
3270       ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3271 
3272    for (unsigned i = 0; i < count; i++) {
3273       struct iris_image_view *iv = &shs->image[start_slot + i];
3274 
3275       if (p_images && p_images[i].resource) {
3276          const struct pipe_image_view *img = &p_images[i];
3277          struct iris_resource *res = (void *) img->resource;
3278 
3279          util_copy_image_view(&iv->base, img);
3280 
3281          shs->bound_image_views |= BITFIELD64_BIT(start_slot + i);
3282 
3283          res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3284          res->bind_stages |= 1 << stage;
3285 
3286          enum isl_format isl_fmt = iris_image_view_get_format(ice, img);
3287 
3288          unsigned aux_usages = 1 << ISL_AUX_USAGE_NONE;
3289 
3290          /* Gfx12+ supports render compression for images */
3291          if (GFX_VER >= 12 && isl_aux_usage_has_ccs_e(res->aux.usage))
3292             aux_usages |= 1 << ISL_AUX_USAGE_CCS_E;
3293 
3294          alloc_surface_states(&iv->surface_state, aux_usages);
3295          iv->surface_state.bo_address = res->bo->address;
3296 
3297          if (res->base.b.target != PIPE_BUFFER) {
3298             struct isl_view view = {
3299                .format = isl_fmt,
3300                .base_level = img->u.tex.level,
3301                .levels = 1,
3302                .base_array_layer = img->u.tex.first_layer,
3303                .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3304                .swizzle = ISL_SWIZZLE_IDENTITY,
3305                .usage = ISL_SURF_USAGE_STORAGE_BIT,
3306             };
3307 
3308             /* If we're using the untyped fallback (ISL_FORMAT_RAW). */
3309             if (isl_fmt == ISL_FORMAT_RAW) {
3310                fill_buffer_surface_state(&screen->isl_dev, res,
3311                                          iv->surface_state.cpu,
3312                                          isl_fmt, ISL_SWIZZLE_IDENTITY,
3313                                          0, res->bo->size,
3314                                          ISL_SURF_USAGE_STORAGE_BIT);
3315             } else {
3316                fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3317                                    &res->surf, &view, 0, 0, 0);
3318             }
3319 
3320             isl_surf_fill_image_param(&screen->isl_dev,
3321                                       &image_params[start_slot + i],
3322                                       &res->surf, &view);
3323          } else if (img->access & PIPE_IMAGE_ACCESS_TEX2D_FROM_BUFFER) {
3324             /* In case it's a 2d image created from a buffer, we should
3325              * use fill_surface_states function with image parameters provided
3326              * by the CL application
3327              */
3328             isl_surf_usage_flags_t usage =  ISL_SURF_USAGE_STORAGE_BIT;
3329             struct isl_view view = {
3330                .format = isl_fmt,
3331                .base_level = 0,
3332                .levels = 1,
3333                .base_array_layer = 0,
3334                .array_len = 1,
3335                .swizzle = ISL_SWIZZLE_IDENTITY,
3336                .usage = usage,
3337             };
3338 
3339             /* Create temp_surf and fill with values provided by CL application */
3340             struct isl_surf temp_surf;
3341             enum isl_format fmt = iris_image_view_get_format(ice, img);
3342             fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt,
3343                                             img->u.tex2d_from_buf.width,
3344                                             img->u.tex2d_from_buf.height,
3345                                             img->u.tex2d_from_buf.row_stride,
3346                                             usage,
3347                                             &temp_surf);
3348 
3349             fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3350                                 &temp_surf, &view, 0, 0, 0);
3351             isl_surf_fill_image_param(&screen->isl_dev,
3352                                       &image_params[start_slot + i],
3353                                       &temp_surf, &view);
3354          } else {
3355             util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3356                            img->u.buf.offset + img->u.buf.size);
3357 
3358             fill_buffer_surface_state(&screen->isl_dev, res,
3359                                       iv->surface_state.cpu,
3360                                       isl_fmt, ISL_SWIZZLE_IDENTITY,
3361                                       img->u.buf.offset, img->u.buf.size,
3362                                       ISL_SURF_USAGE_STORAGE_BIT);
3363             fill_buffer_image_param(&image_params[start_slot + i],
3364                                     img->format, img->u.buf.size);
3365          }
3366 
3367          upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
3368       } else {
3369          pipe_resource_reference(&iv->base.resource, NULL);
3370          pipe_resource_reference(&iv->surface_state.ref.res, NULL);
3371          fill_default_image_param(&image_params[start_slot + i]);
3372       }
3373    }
3374 
3375    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3376    ice->state.dirty |=
3377       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3378                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3379 
3380    /* Broadwell also needs isl_image_params re-uploaded */
3381    if (GFX_VER < 9) {
3382       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3383       shs->sysvals_need_upload = true;
3384    }
3385 
3386    if (unbind_num_trailing_slots) {
3387       iris_set_shader_images(ctx, p_stage, start_slot + count,
3388                              unbind_num_trailing_slots, 0, NULL);
3389    }
3390 }
3391 
3392 UNUSED static bool
3393 is_sampler_view_3d(const struct iris_sampler_view *view)
3394 {
3395    return view && view->res->base.b.target == PIPE_TEXTURE_3D;
3396 }
3397 
3398 /**
3399  * The pipe->set_sampler_views() driver hook.
3400  */
3401 static void
3402 iris_set_sampler_views(struct pipe_context *ctx,
3403                        enum pipe_shader_type p_stage,
3404                        unsigned start, unsigned count,
3405                        unsigned unbind_num_trailing_slots,
3406                        bool take_ownership,
3407                        struct pipe_sampler_view **views)
3408 {
3409    struct iris_context *ice = (struct iris_context *) ctx;
3410    UNUSED struct iris_screen *screen = (void *) ctx->screen;
3411    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
3412    gl_shader_stage stage = stage_from_pipe(p_stage);
3413    struct iris_shader_state *shs = &ice->state.shaders[stage];
3414    unsigned i;
3415 
3416    if (count == 0 && unbind_num_trailing_slots == 0)
3417       return;
3418 
3419    BITSET_CLEAR_RANGE(shs->bound_sampler_views, start,
3420                       start + count + unbind_num_trailing_slots - 1);
3421 
3422    for (i = 0; i < count; i++) {
3423       struct pipe_sampler_view *pview = views ? views[i] : NULL;
3424       struct iris_sampler_view *view = (void *) pview;
3425 
3426 #if GFX_VERx10 == 125
3427       if (intel_needs_workaround(screen->devinfo, 14014414195)) {
3428          if (is_sampler_view_3d(shs->textures[start + i]) !=
3429              is_sampler_view_3d(view))
3430             ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3431       }
3432 #endif
3433 
3434       if (take_ownership) {
3435          pipe_sampler_view_reference((struct pipe_sampler_view **)
3436                                      &shs->textures[start + i], NULL);
3437          shs->textures[start + i] = (struct iris_sampler_view *)pview;
3438       } else {
3439          pipe_sampler_view_reference((struct pipe_sampler_view **)
3440                                      &shs->textures[start + i], pview);
3441       }
3442       if (view) {
3443          view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3444          view->res->bind_stages |= 1 << stage;
3445 
3446          BITSET_SET(shs->bound_sampler_views, start + i);
3447 
3448          update_surface_state_addrs(ice->state.surface_uploader,
3449                                     &view->surface_state, view->res->bo);
3450       }
3451    }
3452    for (; i < count + unbind_num_trailing_slots; i++) {
3453       pipe_sampler_view_reference((struct pipe_sampler_view **)
3454                                   &shs->textures[start + i], NULL);
3455    }
3456 
3457    ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
3458    ice->state.dirty |=
3459       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3460                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3461 }
3462 
3463 static void
3464 iris_set_compute_resources(struct pipe_context *ctx,
3465                            unsigned start, unsigned count,
3466                            struct pipe_surface **resources)
3467 {
3468    assert(count == 0);
3469 }
3470 
3471 static void
3472 iris_set_global_binding(struct pipe_context *ctx,
3473                         unsigned start_slot, unsigned count,
3474                         struct pipe_resource **resources,
3475                         uint32_t **handles)
3476 {
3477    struct iris_context *ice = (struct iris_context *) ctx;
3478 
3479    assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
3480    for (unsigned i = 0; i < count; i++) {
3481       if (resources && resources[i]) {
3482          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3483                                  resources[i]);
3484 
3485          struct iris_resource *res = (void *) resources[i];
3486          assert(res->base.b.target == PIPE_BUFFER);
3487          util_range_add(&res->base.b, &res->valid_buffer_range,
3488                         0, res->base.b.width0);
3489 
3490          uint64_t addr = 0;
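         /* Patch the handle in place: whatever offset the caller stored is
          * rebased onto the buffer's GPU virtual address.
          */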
3491          memcpy(&addr, handles[i], sizeof(addr));
3492          addr += res->bo->address + res->offset;
3493          memcpy(handles[i], &addr, sizeof(addr));
3494       } else {
3495          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3496                                  NULL);
3497       }
3498    }
3499 
3500    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
3501 }
3502 
3503 /**
3504  * The pipe->set_tess_state() driver hook.
3505  */
3506 static void
3507 iris_set_tess_state(struct pipe_context *ctx,
3508                     const float default_outer_level[4],
3509                     const float default_inner_level[2])
3510 {
3511    struct iris_context *ice = (struct iris_context *) ctx;
3512    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3513 
3514    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3515    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3516 
3517    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
3518    shs->sysvals_need_upload = true;
3519 }
3520 
3521 static void
3522 iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3523 {
3524    struct iris_context *ice = (struct iris_context *) ctx;
3525 
3526    ice->state.patch_vertices = patch_vertices;
3527 }
3528 
3529 static void
3530 iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3531 {
3532    struct iris_surface *surf = (void *) p_surf;
3533    pipe_resource_reference(&p_surf->texture, NULL);
3534    pipe_resource_reference(&surf->surface_state.ref.res, NULL);
3535    pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
3536    free(surf->surface_state.cpu);
3537    free(surf->surface_state_read.cpu);
3538    free(surf);
3539 }
3540 
3541 static void
3542 iris_set_clip_state(struct pipe_context *ctx,
3543                     const struct pipe_clip_state *state)
3544 {
3545    struct iris_context *ice = (struct iris_context *) ctx;
3546    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3547    struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3548    struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3549 
3550    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3551 
3552    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
3553                              IRIS_STAGE_DIRTY_CONSTANTS_GS |
3554                              IRIS_STAGE_DIRTY_CONSTANTS_TES;
3555    shs->sysvals_need_upload = true;
3556    gshs->sysvals_need_upload = true;
3557    tshs->sysvals_need_upload = true;
3558 }
3559 
3560 /**
3561  * The pipe->set_polygon_stipple() driver hook.
3562  */
3563 static void
3564 iris_set_polygon_stipple(struct pipe_context *ctx,
3565                          const struct pipe_poly_stipple *state)
3566 {
3567    struct iris_context *ice = (struct iris_context *) ctx;
3568    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3569    ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
3570 }
3571 
3572 /**
3573  * The pipe->set_sample_mask() driver hook.
3574  */
3575 static void
3576 iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3577 {
3578    struct iris_context *ice = (struct iris_context *) ctx;
3579 
3580    /* We only support 16x MSAA, so we have 16 bits of sample mask.
3581     * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3582     */
3583    ice->state.sample_mask = sample_mask & 0xffff;
3584    ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
3585 }
3586 
3587 /**
3588  * The pipe->set_scissor_states() driver hook.
3589  *
3590  * This corresponds to our SCISSOR_RECT state structures.  It's an
3591  * exact match, so we just store them, and memcpy them out later.
3592  */
3593 static void
3594 iris_set_scissor_states(struct pipe_context *ctx,
3595                         unsigned start_slot,
3596                         unsigned num_scissors,
3597                         const struct pipe_scissor_state *rects)
3598 {
3599    struct iris_context *ice = (struct iris_context *) ctx;
3600 
3601    for (unsigned i = 0; i < num_scissors; i++) {
3602       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3603          /* If the scissor was out of bounds and got clamped to 0 width/height
3604           * at the bounds, the subtraction of 1 from maximums could produce a
3605           * negative number and thus not clip anything.  Instead, just provide
3606           * a min > max scissor inside the bounds, which produces the expected
3607           * no rendering.
3608           */
3609          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3610             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3611          };
3612       } else {
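         /* Gallium's maxx/maxy are exclusive, while SCISSOR_RECT stores
          * inclusive maximums, hence the subtraction of 1 here.
          */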
3613          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3614             .minx = rects[i].minx,     .miny = rects[i].miny,
3615             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3616          };
3617       }
3618    }
3619 
3620    ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3621 }
3622 
3623 /**
3624  * The pipe->set_stencil_ref() driver hook.
3625  *
3626  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3627  */
3628 static void
3629 iris_set_stencil_ref(struct pipe_context *ctx,
3630                      const struct pipe_stencil_ref state)
3631 {
3632    struct iris_context *ice = (struct iris_context *) ctx;
3633    memcpy(&ice->state.stencil_ref, &state, sizeof(state));
3634    if (GFX_VER >= 12)
3635       ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3636    else if (GFX_VER >= 9)
3637       ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3638    else
3639       ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3640 }
3641 
3642 static float
3643 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3644 {
3645    return copysignf(state->scale[axis], sign) + state->translate[axis];
3646 }
3647 
3648 /**
3649  * The pipe->set_viewport_states() driver hook.
3650  *
3651  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3652  * the guardband yet, as we need the framebuffer dimensions, but we can
3653  * at least fill out the rest.
3654  */
3655 static void
3656 iris_set_viewport_states(struct pipe_context *ctx,
3657                          unsigned start_slot,
3658                          unsigned count,
3659                          const struct pipe_viewport_state *states)
3660 {
3661    struct iris_context *ice = (struct iris_context *) ctx;
3662    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3663 
3664    memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3665 
3666    /* Fix depth test misrenderings by lowering translated depth range */
3667    if (screen->driconf.lower_depth_range_rate != 1.0f)
3668       ice->state.viewports[start_slot].translate[2] *=
3669          screen->driconf.lower_depth_range_rate;
3670 
3671    ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3672 
3673    if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3674                                !ice->state.cso_rast->depth_clip_far))
3675       ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3676 }
3677 
3678 /**
3679  * The pipe->set_framebuffer_state() driver hook.
3680  *
3681  * Sets the current draw FBO, including color render targets, depth,
3682  * and stencil buffers.
3683  */
3684 static void
3685 iris_set_framebuffer_state(struct pipe_context *ctx,
3686                            const struct pipe_framebuffer_state *state)
3687 {
3688    struct iris_context *ice = (struct iris_context *) ctx;
3689    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3690    const struct intel_device_info *devinfo = screen->devinfo;
3691    struct isl_device *isl_dev = &screen->isl_dev;
3692    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3693    struct iris_resource *zres;
3694    struct iris_resource *stencil_res;
3695 
3696    unsigned samples = util_framebuffer_get_num_samples(state);
3697    unsigned layers = util_framebuffer_get_num_layers(state);
3698 
3699    if (cso->samples != samples) {
3700       ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
3701 
3702       /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
3703       if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
3704          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
3705 
3706       /* We may need to emit blend state for Wa_14018912822. */
3707       if ((cso->samples > 1) != (samples > 1) &&
3708           intel_needs_workaround(devinfo, 14018912822)) {
3709          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3710          ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
3711       }
3712    }
3713 
3714    if (cso->nr_cbufs != state->nr_cbufs) {
3715       ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3716    }
3717 
3718    if ((cso->layers == 0) != (layers == 0)) {
3719       ice->state.dirty |= IRIS_DIRTY_CLIP;
3720    }
3721 
3722    if (cso->width != state->width || cso->height != state->height) {
3723       ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3724    }
3725 
3726    if (cso->zsbuf || state->zsbuf) {
3727       ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
3728    }
3729 
3730    bool has_integer_rt = false;
3731    for (unsigned i = 0; i < state->nr_cbufs; i++) {
3732       if (state->cbufs[i]) {
3733          enum isl_format ifmt =
3734             isl_format_for_pipe_format(state->cbufs[i]->format);
3735          has_integer_rt |= isl_format_has_int_channel(ifmt);
3736       }
3737    }
3738 
3739    /* 3DSTATE_RASTER::AntialiasingEnable */
3740    if (has_integer_rt != ice->state.has_integer_rt ||
3741        cso->samples != samples) {
3742       ice->state.dirty |= IRIS_DIRTY_RASTER;
3743    }
3744 
3745    util_copy_framebuffer_state(cso, state);
3746    cso->samples = samples;
3747    cso->layers = layers;
3748 
3749    ice->state.has_integer_rt = has_integer_rt;
3750 
3751    struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
3752 
3753    struct isl_view view = {
3754       .base_level = 0,
3755       .levels = 1,
3756       .base_array_layer = 0,
3757       .array_len = 1,
3758       .swizzle = ISL_SWIZZLE_IDENTITY,
3759    };
3760 
3761    struct isl_depth_stencil_hiz_emit_info info = {
3762       .view = &view,
3763       .mocs = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_DEPTH_BIT),
3764    };
3765 
3766    if (cso->zsbuf) {
3767       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
3768                                        &stencil_res);
3769 
3770       view.base_level = cso->zsbuf->u.tex.level;
3771       view.base_array_layer = cso->zsbuf->u.tex.first_layer;
3772       view.array_len =
3773          cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
3774 
3775       if (zres) {
3776          view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
3777 
3778          info.depth_surf = &zres->surf;
3779          info.depth_address = zres->bo->address + zres->offset;
3780          info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);
3781 
3782          view.format = zres->surf.format;
3783 
3784          if (iris_resource_level_has_hiz(devinfo, zres, view.base_level)) {
3785             info.hiz_usage = zres->aux.usage;
3786             info.hiz_surf = &zres->aux.surf;
3787             info.hiz_address = zres->aux.bo->address + zres->aux.offset;
3788          }
3789 
3790          ice->state.hiz_usage = info.hiz_usage;
3791       }
3792 
3793       if (stencil_res) {
3794          view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
3795          info.stencil_aux_usage = stencil_res->aux.usage;
3796          info.stencil_surf = &stencil_res->surf;
3797          info.stencil_address = stencil_res->bo->address + stencil_res->offset;
3798          if (!zres) {
3799             view.format = stencil_res->surf.format;
3800             info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
3801          }
3802       }
3803    }
3804 
3805    isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
3806 
3807    /* Make a null surface for unbound buffers */
3808    void *null_surf_map =
3809       upload_state(ice->state.surface_uploader, &ice->state.null_fb,
3810                    4 * GENX(RENDER_SURFACE_STATE_length), 64);
3811    isl_null_fill_state(&screen->isl_dev, null_surf_map,
3812                        .size = isl_extent3d(MAX2(cso->width, 1),
3813                                             MAX2(cso->height, 1),
3814                                             cso->layers ? cso->layers : 1));
3815    ice->state.null_fb.offset +=
3816       iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));
3817 
3818    /* Render target change */
3819    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;
3820 
3821    ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
3822 
3823    ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3824 
3825    ice->state.stage_dirty |=
3826       ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
3827 
3828    if (GFX_VER == 8)
3829       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
3830 }
3831 
3832 /**
3833  * The pipe->set_constant_buffer() driver hook.
3834  *
3835  * This uploads any constant data in user buffers, and references
3836  * any UBO resources containing constant data.
3837  */
3838 static void
3839 iris_set_constant_buffer(struct pipe_context *ctx,
3840                          enum pipe_shader_type p_stage, unsigned index,
3841                          bool take_ownership,
3842                          const struct pipe_constant_buffer *input)
3843 {
3844    struct iris_context *ice = (struct iris_context *) ctx;
3845    gl_shader_stage stage = stage_from_pipe(p_stage);
3846    struct iris_shader_state *shs = &ice->state.shaders[stage];
3847    struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3848 
3849    /* TODO: Only do this if the buffer changes? */
3850    pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3851 
3852    if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3853       shs->bound_cbufs |= 1u << index;
3854 
3855       if (input->user_buffer) {
3856          void *map = NULL;
3857          pipe_resource_reference(&cbuf->buffer, NULL);
3858          u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3859                         &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3860 
3861          if (!cbuf->buffer) {
3862             /* Allocation was unsuccessful - just unbind */
3863             iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3864             return;
3865          }
3866 
3867          assert(map);
3868          memcpy(map, input->user_buffer, input->buffer_size);
3869       } else if (input->buffer) {
3870          if (cbuf->buffer != input->buffer) {
3871             ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3872                                  IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3873             shs->dirty_cbufs |= 1u << index;
3874          }
3875 
3876          if (take_ownership) {
3877             pipe_resource_reference(&cbuf->buffer, NULL);
3878             cbuf->buffer = input->buffer;
3879          } else {
3880             pipe_resource_reference(&cbuf->buffer, input->buffer);
3881          }
3882 
3883          cbuf->buffer_offset = input->buffer_offset;
3884       }
3885 
3886       cbuf->buffer_size =
3887          MIN2(input->buffer_size,
3888               iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3889 
3890       struct iris_resource *res = (void *) cbuf->buffer;
3891       res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3892       res->bind_stages |= 1 << stage;
3893    } else {
3894       shs->bound_cbufs &= ~(1u << index);
3895       pipe_resource_reference(&cbuf->buffer, NULL);
3896    }
3897 
3898    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3899 }
3900 
3901 static void
3902 upload_sysvals(struct iris_context *ice,
3903                gl_shader_stage stage,
3904                const struct pipe_grid_info *grid)
3905 {
3906    UNUSED struct iris_genx_state *genx = ice->state.genx;
3907    struct iris_shader_state *shs = &ice->state.shaders[stage];
3908 
3909    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
3910    if (!shader || (shader->num_system_values == 0 &&
3911                    shader->kernel_input_size == 0))
3912       return;
3913 
3914    assert(shader->num_cbufs > 0);
3915 
3916    unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3917    struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
3918    unsigned system_values_start =
3919       ALIGN(shader->kernel_input_size, sizeof(uint32_t));
3920    unsigned upload_size = system_values_start +
3921                           shader->num_system_values * sizeof(uint32_t);
3922    void *map = NULL;
3923 
3924    assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3925    u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3926                   &cbuf->buffer_offset, &cbuf->buffer, &map);
3927 
3928    if (shader->kernel_input_size > 0)
3929       memcpy(map, grid->input, shader->kernel_input_size);
3930 
3931    uint32_t *sysval_map = map + system_values_start;
3932    for (int i = 0; i < shader->num_system_values; i++) {
3933       uint32_t sysval = shader->system_values[i];
3934       uint32_t value = 0;
3935 
3936 #if GFX_VER >= 9
3937       #define COMPILER(x) BRW_##x
3938 #else
3939       #define COMPILER(x) ELK_##x
3940 #endif
3941 
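      /* COMPILER() expands to the BRW_ (Gfx9+) or ELK_ (Gfx8) prefix, so the
       * comparisons below match whichever compiler backend built the shader.
       */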
3942       if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
3943 #if GFX_VER == 8
3944          unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
3945          unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
3946          struct isl_image_param *param =
3947             &genx->shaders[stage].image_param[img];
3948 
3949          assert(offset < sizeof(struct isl_image_param));
3950          value = ((uint32_t *) param)[offset];
3951 #endif
3952       } else if (sysval == COMPILER(PARAM_BUILTIN_ZERO)) {
3953          value = 0;
3954       } else if (COMPILER(PARAM_BUILTIN_IS_CLIP_PLANE(sysval))) {
3955          int plane = COMPILER(PARAM_BUILTIN_CLIP_PLANE_IDX(sysval));
3956          int comp  = COMPILER(PARAM_BUILTIN_CLIP_PLANE_COMP(sysval));
3957          value = fui(ice->state.clip_planes.ucp[plane][comp]);
3958       } else if (sysval == COMPILER(PARAM_BUILTIN_PATCH_VERTICES_IN)) {
3959          if (stage == MESA_SHADER_TESS_CTRL) {
3960             value = ice->state.vertices_per_patch;
3961          } else {
3962             assert(stage == MESA_SHADER_TESS_EVAL);
3963             const struct shader_info *tcs_info =
3964                iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3965             if (tcs_info)
3966                value = tcs_info->tess.tcs_vertices_out;
3967             else
3968                value = ice->state.vertices_per_patch;
3969          }
3970       } else if (sysval >= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X) &&
3971                  sysval <= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_W)) {
3972          unsigned i = sysval - COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X);
3973          value = fui(ice->state.default_outer_level[i]);
3974       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_X)) {
3975          value = fui(ice->state.default_inner_level[0]);
3976       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_Y)) {
3977          value = fui(ice->state.default_inner_level[1]);
3978       } else if (sysval >= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X) &&
3979                  sysval <= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_Z)) {
3980          unsigned i = sysval - COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X);
3981          value = ice->state.last_block[i];
3982       } else if (sysval == COMPILER(PARAM_BUILTIN_WORK_DIM)) {
3983          value = grid->work_dim;
3984       } else {
3985          assert(!"unhandled system value");
3986       }
3987 
3988       *sysval_map++ = value;
3989    }
3990 
3991    cbuf->buffer_size = upload_size;
3992    iris_upload_ubo_ssbo_surf_state(ice, cbuf,
3993                                    &shs->constbuf_surf_state[sysval_cbuf_index],
3994                                    ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
3995 
3996    shs->sysvals_need_upload = false;
3997 }
3998 
3999 /**
4000  * The pipe->set_shader_buffers() driver hook.
4001  *
4002  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
4003  * SURFACE_STATE here, as the buffer offset may change each time.
4004  */
4005 static void
4006 iris_set_shader_buffers(struct pipe_context *ctx,
4007                         enum pipe_shader_type p_stage,
4008                         unsigned start_slot, unsigned count,
4009                         const struct pipe_shader_buffer *buffers,
4010                         unsigned writable_bitmask)
4011 {
4012    struct iris_context *ice = (struct iris_context *) ctx;
4013    gl_shader_stage stage = stage_from_pipe(p_stage);
4014    struct iris_shader_state *shs = &ice->state.shaders[stage];
4015 
4016    unsigned modified_bits = u_bit_consecutive(start_slot, count);
4017 
4018    shs->bound_ssbos &= ~modified_bits;
4019    shs->writable_ssbos &= ~modified_bits;
4020    shs->writable_ssbos |= writable_bitmask << start_slot;
4021 
4022    for (unsigned i = 0; i < count; i++) {
4023       if (buffers && buffers[i].buffer) {
4024          struct iris_resource *res = (void *) buffers[i].buffer;
4025          struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
4026          struct iris_state_ref *surf_state =
4027             &shs->ssbo_surf_state[start_slot + i];
4028          pipe_resource_reference(&ssbo->buffer, &res->base.b);
4029          ssbo->buffer_offset = buffers[i].buffer_offset;
4030          ssbo->buffer_size =
4031             MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
4032 
4033          shs->bound_ssbos |= 1 << (start_slot + i);
4034 
4035          isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
4036 
4037          iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
4038 
4039          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
4040          res->bind_stages |= 1 << stage;
4041 
4042          util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
4043                         ssbo->buffer_offset + ssbo->buffer_size);
4044       } else {
4045          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
4046          pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
4047                                  NULL);
4048       }
4049    }
4050 
4051    ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
4052                         IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
4053    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
4054 }
4055 
4056 static void
4057 iris_delete_state(struct pipe_context *ctx, void *state)
4058 {
4059    free(state);
4060 }
4061 
4062 /**
4063  * The pipe->set_vertex_buffers() driver hook.
4064  *
4065  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
4066  */
4067 static void
4068 iris_set_vertex_buffers(struct pipe_context *ctx,
4069                         unsigned count,
4070                         const struct pipe_vertex_buffer *buffers)
4071 {
4072    struct iris_context *ice = (struct iris_context *) ctx;
4073    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4074    struct iris_genx_state *genx = ice->state.genx;
4075 
4076    unsigned last_count = util_last_bit64(ice->state.bound_vertex_buffers);
4077    ice->state.bound_vertex_buffers = 0;
4078 
4079    for (unsigned i = 0; i < count; i++) {
4080       const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
4081       struct iris_vertex_buffer_state *state =
4082          &genx->vertex_buffers[i];
4083 
4084       if (!buffer) {
4085          pipe_resource_reference(&state->resource, NULL);
4086          continue;
4087       }
4088 
4089       /* We may see user buffers that are NULL bindings. */
4090       assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
4091 
4092       if (buffer->buffer.resource &&
4093           state->resource != buffer->buffer.resource)
4094          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
4095 
4096       pipe_resource_reference(&state->resource, NULL);
4097       state->resource = buffer->buffer.resource;
4098 
4099       struct iris_resource *res = (void *) state->resource;
4100 
4101       state->offset = (int) buffer->buffer_offset;
4102 
4103       if (res) {
4104          ice->state.bound_vertex_buffers |= 1ull << i;
4105          res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4106       }
4107 
4108       iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
4109          vb.VertexBufferIndex = i;
4110          vb.AddressModifyEnable = true;
4111          /* vb.BufferPitch is merged in dynamically from VE state later */
4112          if (res) {
4113             vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
4114             vb.BufferStartingAddress =
4115                ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
4116             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4117                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4118 #if GFX_VER >= 12
4119             vb.L3BypassDisable       = true;
4120 #endif
4121          } else {
4122             vb.NullVertexBuffer = true;
4123             vb.MOCS = iris_mocs(NULL, &screen->isl_dev,
4124                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4125          }
4126       }
4127    }
4128 
4129    for (unsigned i = count; i < last_count; i++) {
4130       struct iris_vertex_buffer_state *state =
4131          &genx->vertex_buffers[i];
4132 
4133       pipe_resource_reference(&state->resource, NULL);
4134    }
4135 
4136    ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4137 }
4138 
4139 /**
4140  * Gallium CSO for vertex elements.
4141  */
4142 struct iris_vertex_element_state {
4143    uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
4144    uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
4145    uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
4146    uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
4147    uint32_t stride[PIPE_MAX_ATTRIBS];
4148    unsigned vb_count;
4149    unsigned count;
4150 };
4151 
4152 /**
4153  * The pipe->create_vertex_elements_state() driver hook.
4154  *
4155  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
4156  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
4157  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
4158  * needed; otherwise, we need information that is only available at draw
4159  * time.  We set up edgeflag_ve and edgeflag_vfi as alternative versions of
4160  * the last 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING, to be used at
4161  * draw time if we detect that the vertex shader needs EdgeFlag.
4162  */
4163 static void *
4164 iris_create_vertex_elements(struct pipe_context *ctx,
4165                             unsigned count,
4166                             const struct pipe_vertex_element *state)
4167 {
4168    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4169    const struct intel_device_info *devinfo = screen->devinfo;
4170    struct iris_vertex_element_state *cso =
4171       calloc(1, sizeof(struct iris_vertex_element_state));
4172 
4173    cso->count = count;
4174    cso->vb_count = 0;
4175 
4176    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
4177       ve.DWordLength =
4178          1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
4179    }
4180 
4181    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
4182    uint32_t *vfi_pack_dest = cso->vf_instancing;
4183 
4184    if (count == 0) {
4185       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4186          ve.Valid = true;
4187          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
4188          ve.Component0Control = VFCOMP_STORE_0;
4189          ve.Component1Control = VFCOMP_STORE_0;
4190          ve.Component2Control = VFCOMP_STORE_0;
4191          ve.Component3Control = VFCOMP_STORE_1_FP;
4192       }
4193 
4194       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4195       }
4196    }
4197 
4198    for (int i = 0; i < count; i++) {
4199       const struct iris_format_info fmt =
4200          iris_format_for_usage(devinfo, state[i].src_format, 0);
4201       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
4202                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
4203 
4204       switch (isl_format_get_num_channels(fmt.fmt)) {
4205       case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
4206       case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
4207       case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
4208       case 3:
4209          comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
4210                                                        : VFCOMP_STORE_1_FP;
4211          break;
4212       }
4213       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4214          ve.EdgeFlagEnable = false;
4215          ve.VertexBufferIndex = state[i].vertex_buffer_index;
4216          ve.Valid = true;
4217          ve.SourceElementOffset = state[i].src_offset;
4218          ve.SourceElementFormat = fmt.fmt;
4219          ve.Component0Control = comp[0];
4220          ve.Component1Control = comp[1];
4221          ve.Component2Control = comp[2];
4222          ve.Component3Control = comp[3];
4223       }
4224 
4225       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4226          vi.VertexElementIndex = i;
4227          vi.InstancingEnable = state[i].instance_divisor > 0;
4228          vi.InstanceDataStepRate = state[i].instance_divisor;
4229       }
4230 
4231       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
4232       vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
4233       cso->stride[state[i].vertex_buffer_index] = state[i].src_stride;
4234       cso->vb_count = MAX2(state[i].vertex_buffer_index + 1, cso->vb_count);
4235    }
4236 
4237    /* An alternative version of the last VE and VFI is stored so it
4238     * can be used at draw time in case the Vertex Shader uses EdgeFlag.
4239     */
4240    if (count) {
4241       const unsigned edgeflag_index = count - 1;
4242       const struct iris_format_info fmt =
4243          iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
4244       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
4245          ve.EdgeFlagEnable = true;
4246          ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
4247          ve.Valid = true;
4248          ve.SourceElementOffset = state[edgeflag_index].src_offset;
4249          ve.SourceElementFormat = fmt.fmt;
4250          ve.Component0Control = VFCOMP_STORE_SRC;
4251          ve.Component1Control = VFCOMP_STORE_0;
4252          ve.Component2Control = VFCOMP_STORE_0;
4253          ve.Component3Control = VFCOMP_STORE_0;
4254       }
4255       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
4256          /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
4257           * at draw time, as it should change if SGVs are emitted.
4258           */
4259          vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
4260          vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
4261       }
4262    }
4263 
4264    return cso;
4265 }
4266 
4267 /**
4268  * The pipe->bind_vertex_elements_state() driver hook.
4269  */
4270 static void
4271 iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
4272 {
4273    struct iris_context *ice = (struct iris_context *) ctx;
4274    struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
4275    struct iris_vertex_element_state *new_cso = state;
4276 
4277    /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
4278     * we need to re-emit it to ensure we're overriding the right one.
4279     */
4280    if (new_cso && cso_changed(count))
4281       ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
4282 
4283    ice->state.cso_vertex_elements = state;
4284    ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
4285    if (new_cso) {
4286       /* re-emit vertex buffer state if stride changes */
4287       if (cso_changed(vb_count) ||
4288           cso_changed_memcmp_elts(stride, new_cso->vb_count))
4289          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4290    }
4291 }
4292 
4293 /**
4294  * The pipe->create_stream_output_target() driver hook.
4295  *
4296  * "Target" here refers to a destination buffer.  We translate this into
4297  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4298  * know which buffer this represents, or whether we ought to zero the
4299  * write-offsets, or append.  Those are handled in the set() hook.
4300  */
4301 static struct pipe_stream_output_target *
4302 iris_create_stream_output_target(struct pipe_context *ctx,
4303                                  struct pipe_resource *p_res,
4304                                  unsigned buffer_offset,
4305                                  unsigned buffer_size)
4306 {
4307    struct iris_resource *res = (void *) p_res;
4308    struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
4309    if (!cso)
4310       return NULL;
4311 
4312    res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4313 
4314    pipe_reference_init(&cso->base.reference, 1);
4315    pipe_resource_reference(&cso->base.buffer, p_res);
4316    cso->base.buffer_offset = buffer_offset;
4317    cso->base.buffer_size = buffer_size;
4318    cso->base.context = ctx;
4319 
4320    util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4321                   buffer_offset + buffer_size);
4322 
4323    return &cso->base;
4324 }
4325 
4326 static void
4327 iris_stream_output_target_destroy(struct pipe_context *ctx,
4328                                   struct pipe_stream_output_target *state)
4329 {
4330    struct iris_stream_output_target *cso = (void *) state;
4331 
4332    pipe_resource_reference(&cso->base.buffer, NULL);
4333    pipe_resource_reference(&cso->offset.res, NULL);
4334 
4335    free(cso);
4336 }
4337 
4338 /**
4339  * The pipe->set_stream_output_targets() driver hook.
4340  *
4341  * At this point, we know which targets are bound to a particular index,
4342  * and also whether we want to append or start over.  We can finish the
4343  * 3DSTATE_SO_BUFFER packets we started earlier.
4344  */
4345 static void
4346 iris_set_stream_output_targets(struct pipe_context *ctx,
4347                                unsigned num_targets,
4348                                struct pipe_stream_output_target **targets,
4349                                const unsigned *offsets)
4350 {
4351    struct iris_context *ice = (struct iris_context *) ctx;
4352    struct iris_genx_state *genx = ice->state.genx;
4353    uint32_t *so_buffers = genx->so_buffers;
4354    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4355 
4356    const bool active = num_targets > 0;
4357    if (ice->state.streamout_active != active) {
4358       ice->state.streamout_active = active;
4359       ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
4360 
4361       /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4362        * it's a non-pipelined command.  If we're switching streamout on, we
4363        * may have missed emitting it earlier, so do so now.  (We're already
4364        * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4365        */
4366       if (active) {
4367          ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
4368       } else {
4369          for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4370             struct iris_stream_output_target *tgt =
4371                (void *) ice->state.so_target[i];
4372 
4373             if (tgt)
4374                iris_dirty_for_history(ice, (void *)tgt->base.buffer);
4375          }
4376       }
4377    }
4378 
4379    for (int i = 0; i < 4; i++) {
4380       pipe_so_target_reference(&ice->state.so_target[i],
4381                                i < num_targets ? targets[i] : NULL);
4382    }
4383 
4384    /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4385    if (!active)
4386       return;
4387 
4388    for (unsigned i = 0; i < 4; i++,
4389         so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
4390 
4391       struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
4392       unsigned offset = offsets[i];
4393 
4394       if (!tgt) {
4395          iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4396 #if GFX_VER < 12
4397             sob.SOBufferIndex = i;
4398 #else
4399             sob._3DCommandOpcode = 0;
4400             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4401 #endif
4402             sob.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
4403          }
4404          continue;
4405       }
4406 
4407       if (!tgt->offset.res)
4408          upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
4409 
4410       struct iris_resource *res = (void *) tgt->base.buffer;
4411 
4412       /* Note that offsets[i] will either be 0, causing us to zero
4413        * the value in the buffer, or 0xFFFFFFFF, which happens to mean
4414        * "continue appending at the existing offset."
4415        */
4416       assert(offset == 0 || offset == 0xFFFFFFFF);
4417 
4418       /* When we're first called with an offset of 0, we want the next
4419        * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
4420        * Any further times we emit those packets, we want to use 0xFFFFFFFF
4421        * to continue appending from the current offset.
4422        *
4423        * Note that we might be called by Begin (offset = 0), Pause, then
4424        * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
4425        * commands will actually be sent to the GPU).  In this case, we
4426        * don't want to append - we still want to do our initial zeroing.
4427        */
4428       if (offset == 0)
4429          tgt->zero_offset = true;
4430 
4431       iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4432 #if GFX_VER < 12
4433          sob.SOBufferIndex = i;
4434 #else
4435          sob._3DCommandOpcode = 0;
4436          sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4437 #endif
4438          sob.SurfaceBaseAddress =
4439             rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
4440                   IRIS_DOMAIN_OTHER_WRITE);
4441          sob.SOBufferEnable = true;
4442          sob.StreamOffsetWriteEnable = true;
4443          sob.StreamOutputBufferOffsetAddressEnable = true;
4444          sob.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4445                               ISL_SURF_USAGE_STREAM_OUT_BIT);
4446 
4447          sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
4448          sob.StreamOutputBufferOffsetAddress =
4449             rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
4450                         tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
4451          sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
4452       }
4453    }
4454 
4455    ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
4456 }
4457 
4458 /**
4459  * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4460  * 3DSTATE_STREAMOUT packets.
4461  *
4462  * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4463  * hardware to record.  We can create it entirely based on the shader, with
4464  * no dynamic state dependencies.
4465  *
4466  * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4467  * state-based settings.  We capture the shader-related ones here, and merge
4468  * the rest in at draw time.
4469  */
4470 static uint32_t *
4471 iris_create_so_decl_list(const struct pipe_stream_output_info *info,
4472                          const struct intel_vue_map *vue_map)
4473 {
4474    struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4475    int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4476    int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4477    int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4478    int max_decls = 0;
4479    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4480 
4481    memset(so_decl, 0, sizeof(so_decl));
4482 
4483    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
4484     * command feels strange -- each dword pair contains a SO_DECL per stream.
4485     */
4486    for (unsigned i = 0; i < info->num_outputs; i++) {
4487       const struct pipe_stream_output *output = &info->output[i];
4488       const int buffer = output->output_buffer;
4489       const int varying = output->register_index;
4490       const unsigned stream_id = output->stream;
4491       assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4492 
4493       buffer_mask[stream_id] |= 1 << buffer;
4494 
4495       assert(vue_map->varying_to_slot[varying] >= 0);
4496 
4497       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4498        * array.  Instead, it simply increments DstOffset for the following
4499        * input by the number of components that should be skipped.
4500        *
4501        * Our hardware is unusual in that it requires us to program SO_DECLs
4502        * for fake "hole" components, rather than simply taking the offset
4503        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
4504        * program as many size = 4 holes as we can, then a final hole to
4505        * accommodate the final 1, 2, or 3 remaining.
4506        */
4507       int skip_components = output->dst_offset - next_offset[buffer];
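      /* Worked example (illustrative values, not from the original source):
       * if output->dst_offset is 7 and next_offset[buffer] is 0, the loop
       * below emits a size-4 hole (ComponentMask 0xf) and then a size-3 hole
       * (ComponentMask 0x7) before the real SO_DECL for this output.
       */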
4508 
4509       while (skip_components > 0) {
4510          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4511             .HoleFlag = 1,
4512             .OutputBufferSlot = output->output_buffer,
4513             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4514          };
4515          skip_components -= 4;
4516       }
4517 
4518       next_offset[buffer] = output->dst_offset + output->num_components;
4519 
4520       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4521          .OutputBufferSlot = output->output_buffer,
4522          .RegisterIndex = vue_map->varying_to_slot[varying],
4523          .ComponentMask =
4524             ((1 << output->num_components) - 1) << output->start_component,
4525       };
4526 
4527       if (decls[stream_id] > max_decls)
4528          max_decls = decls[stream_id];
4529    }
4530 
4531    unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4532    uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4533    uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
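   /* Worked example (illustrative): with max_decls = 4, the SO_DECL_LIST
    * portion is 3 + 2 * 4 = 11 dwords -- a 3-dword header followed by one
    * two-dword SO_DECL_ENTRY per entry, each holding one SO_DECL per stream.
    */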
4534 
4535    iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4536       int urb_entry_read_offset = 0;
4537       int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4538          urb_entry_read_offset;
4539 
4540       /* We always read the whole vertex.  This could be reduced at some
4541        * point by reading less and offsetting the register index in the
4542        * SO_DECLs.
4543        */
4544       sol.Stream0VertexReadOffset = urb_entry_read_offset;
4545       sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4546       sol.Stream1VertexReadOffset = urb_entry_read_offset;
4547       sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4548       sol.Stream2VertexReadOffset = urb_entry_read_offset;
4549       sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4550       sol.Stream3VertexReadOffset = urb_entry_read_offset;
4551       sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4552 
4553       /* Set buffer pitches; 0 means unbound. */
4554       sol.Buffer0SurfacePitch = 4 * info->stride[0];
4555       sol.Buffer1SurfacePitch = 4 * info->stride[1];
4556       sol.Buffer2SurfacePitch = 4 * info->stride[2];
4557       sol.Buffer3SurfacePitch = 4 * info->stride[3];
4558    }
4559 
4560    iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4561       list.DWordLength = 3 + 2 * max_decls - 2;
4562       list.StreamtoBufferSelects0 = buffer_mask[0];
4563       list.StreamtoBufferSelects1 = buffer_mask[1];
4564       list.StreamtoBufferSelects2 = buffer_mask[2];
4565       list.StreamtoBufferSelects3 = buffer_mask[3];
4566       list.NumEntries0 = decls[0];
4567       list.NumEntries1 = decls[1];
4568       list.NumEntries2 = decls[2];
4569       list.NumEntries3 = decls[3];
4570    }
4571 
4572    for (int i = 0; i < max_decls; i++) {
4573       iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4574          entry.Stream0Decl = so_decl[0][i];
4575          entry.Stream1Decl = so_decl[1][i];
4576          entry.Stream2Decl = so_decl[2][i];
4577          entry.Stream3Decl = so_decl[3][i];
4578       }
4579    }
4580 
4581    return map;
4582 }
4583 
4584 static inline int
4585 iris_compute_first_urb_slot_required(uint64_t inputs_read,
4586                                      const struct intel_vue_map *prev_stage_vue_map)
4587 {
4588 #if GFX_VER >= 9
4589    return brw_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4590 #else
4591    return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4592 #endif
4593 }
4594 
4595 static void
4596 iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
4597                                    const struct intel_vue_map *last_vue_map,
4598                                    bool two_sided_color,
4599                                    unsigned *out_offset,
4600                                    unsigned *out_length)
4601 {
4602    /* The compiler computes the first URB slot without considering COL/BFC
4603     * swizzling (because it doesn't know whether it's enabled), so we need
4604     * to do that here too.  This may result in a smaller offset, which
4605     * should be safe.
4606     */
4607    const unsigned first_slot =
4608       iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
4609 
4610    /* This becomes the URB read offset (counted in pairs of slots). */
4611    assert(first_slot % 2 == 0);
4612    *out_offset = first_slot / 2;
4613 
4614    /* We need to adjust the inputs read to account for front/back color
4615     * swizzling, as it can make the URB length longer.
4616     */
4617    for (int c = 0; c <= 1; c++) {
4618       if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
4619          /* If two sided color is enabled, the fragment shader's gl_Color
4620           * (COL0) input comes from either the gl_FrontColor (COL0) or
4621           * gl_BackColor (BFC0) input varyings.  Mark BFC as used, too.
4622           */
4623          if (two_sided_color)
4624             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4625 
4626          /* If front color isn't written, we opt to give them back color
4627           * instead of an undefined value.  Switch from COL to BFC.
4628           */
4629          if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
4630             fs_input_slots &= ~(VARYING_BIT_COL0 << c);
4631             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4632          }
4633       }
4634    }
4635 
4636    /* Compute the minimum URB Read Length necessary for the FS inputs.
4637     *
4638     * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4639     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4640     *
4641     * "This field should be set to the minimum length required to read the
4642     *  maximum source attribute.  The maximum source attribute is indicated
4643     *  by the maximum value of the enabled Attribute # Source Attribute if
4644     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4645     *  enable is not set.
4646     *  read_length = ceiling((max_source_attr + 1) / 2)
4647     *
4648     *  [errata] Corruption/Hang possible if length programmed larger than
4649     *  recommended"
4650     *
4651     * Similar text exists for Ivy Bridge.
4652     *
4653     * We find the last URB slot that's actually read by the FS.
4654     */
4655    unsigned last_read_slot = last_vue_map->num_slots - 1;
4656    while (last_read_slot > first_slot && !(fs_input_slots &
4657           (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4658       --last_read_slot;
4659 
4660    /* The URB read length is the difference of the two, counted in pairs. */
4661    *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
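   /* Worked example (illustrative values): with first_slot = 2 and the last
    * FS-read slot at index 7, this gives *out_offset = 1 and
    * *out_length = DIV_ROUND_UP(7 - 2 + 1, 2) = 3 pairs of slots.
    */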
4662 }
4663 
4664 static void
4665 iris_emit_sbe_swiz(struct iris_batch *batch,
4666                    const struct iris_context *ice,
4667                    const struct intel_vue_map *vue_map,
4668                    unsigned urb_read_offset,
4669                    unsigned sprite_coord_enables)
4670 {
4671    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4672    const struct iris_fs_data *fs_data =
4673       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4674    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4675 
4676    /* XXX: this should be generated when putting programs in place */
4677 
4678    for (uint8_t idx = 0; idx < fs_data->urb_setup_attribs_count; idx++) {
4679       const uint8_t fs_attr = fs_data->urb_setup_attribs[idx];
4680       const int input_index = fs_data->urb_setup[fs_attr];
4681       if (input_index < 0 || input_index >= 16)
4682          continue;
4683 
4684       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4685          &attr_overrides[input_index];
4686       int slot = vue_map->varying_to_slot[fs_attr];
4687 
4688       /* Viewport and Layer are stored in the VUE header.  We need to override
4689        * them to zero if earlier stages didn't write them, as GL requires that
4690        * they read back as zero when not explicitly set.
4691        */
4692       switch (fs_attr) {
4693       case VARYING_SLOT_VIEWPORT:
4694       case VARYING_SLOT_LAYER:
4695          attr->ComponentOverrideX = true;
4696          attr->ComponentOverrideW = true;
4697          attr->ConstantSource = CONST_0000;
4698 
4699          if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4700             attr->ComponentOverrideY = true;
4701          if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4702             attr->ComponentOverrideZ = true;
4703          continue;
4704 
4705       default:
4706          break;
4707       }
4708 
4709       if (sprite_coord_enables & (1 << input_index))
4710          continue;
4711 
4712       /* If there was only a back color written but not front, use back
4713        * as the color instead of undefined.
4714        */
4715       if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4716          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4717       if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4718          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4719 
4720       /* Not written by the previous stage - undefined. */
4721       if (slot == -1) {
4722          attr->ComponentOverrideX = true;
4723          attr->ComponentOverrideY = true;
4724          attr->ComponentOverrideZ = true;
4725          attr->ComponentOverrideW = true;
4726          attr->ConstantSource = CONST_0001_FLOAT;
4727          continue;
4728       }
4729 
4730       /* Compute the location of the attribute relative to the read offset,
4731        * which is counted in 256-bit increments (two 128-bit VUE slots).
4732        */
4733       const int source_attr = slot - 2 * urb_read_offset;
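      /* Worked example (illustrative): with urb_read_offset = 1, a varying
       * living in VUE slot 5 becomes source attribute 5 - 2 * 1 = 3.
       */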
4734       assert(source_attr >= 0 && source_attr <= 32);
4735       attr->SourceAttribute = source_attr;
4736 
4737       /* If we are doing two-sided color, and the VUE slot following this one
4738        * represents a back-facing color, then we need to instruct the SF unit
4739        * to do back-facing swizzling.
4740        */
4741       if (cso_rast->light_twoside &&
4742           ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4743             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4744            (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4745             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4746          attr->SwizzleSelect = INPUTATTR_FACING;
4747    }
4748 
4749    iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4750       for (int i = 0; i < 16; i++)
4751          sbes.Attribute[i] = attr_overrides[i];
4752    }
4753 }
4754 
4755 static bool
4756 iris_is_drawing_points(const struct iris_context *ice)
4757 {
4758    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4759 
4760    if (cso_rast->fill_mode_point) {
4761       return true;
4762    }
4763 
4764    if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4765       const struct iris_gs_data *gs_data =
4766          iris_gs_data(ice->shaders.prog[MESA_SHADER_GEOMETRY]);
4767       return gs_data->output_topology == _3DPRIM_POINTLIST;
4768    } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4769       const struct iris_tes_data *tes_data =
4770          iris_tes_data(ice->shaders.prog[MESA_SHADER_TESS_EVAL]);
4771       return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4772    } else {
4773       return ice->state.prim_mode == MESA_PRIM_POINTS;
4774    }
4775 }
4776 
4777 static unsigned
4778 iris_calculate_point_sprite_overrides(const struct iris_fs_data *fs_data,
4779                                       const struct iris_rasterizer_state *cso)
4780 {
4781    unsigned overrides = 0;
4782 
4783    if (fs_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4784       overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_PNTC];
4785 
4786    for (int i = 0; i < 8; i++) {
4787       if ((cso->sprite_coord_enable & (1 << i)) &&
4788           fs_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4789          overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_TEX0 + i];
4790    }
4791 
4792    return overrides;
4793 }
4794 
4795 static void
4796 iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4797 {
4798    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4799    const struct iris_fs_data *fs_data =
4800       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4801    const struct intel_vue_map *last_vue_map =
4802       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
4803 
4804    unsigned urb_read_offset, urb_read_length;
4805    iris_compute_sbe_urb_read_interval(fs_data->inputs,
4806                                       last_vue_map,
4807                                       cso_rast->light_twoside,
4808                                       &urb_read_offset, &urb_read_length);
4809 
4810    unsigned sprite_coord_overrides =
4811       iris_is_drawing_points(ice) ?
4812       iris_calculate_point_sprite_overrides(fs_data, cso_rast) : 0;
4813 
4814    iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4815       sbe.AttributeSwizzleEnable = true;
4816       sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
4817       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4818       sbe.VertexURBEntryReadOffset = urb_read_offset;
4819       sbe.VertexURBEntryReadLength = urb_read_length;
4820       sbe.ForceVertexURBEntryReadOffset = true;
4821       sbe.ForceVertexURBEntryReadLength = true;
4822       sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
4823       sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4824 #if GFX_VER >= 9
4825       for (int i = 0; i < 32; i++) {
4826          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4827       }
4828 #endif
4829 
4830       /* Ask the hardware to supply PrimitiveID if the fragment shader
4831        * reads it but a previous stage didn't write one.
4832        */
4833       if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
4834           last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
4835          sbe.PrimitiveIDOverrideAttributeSelect =
4836             fs_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
4837          sbe.PrimitiveIDOverrideComponentX = true;
4838          sbe.PrimitiveIDOverrideComponentY = true;
4839          sbe.PrimitiveIDOverrideComponentZ = true;
4840          sbe.PrimitiveIDOverrideComponentW = true;
4841       }
4842    }
4843 
4844    iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4845                       sprite_coord_overrides);
4846 }
4847 
4848 /* ------------------------------------------------------------------- */
4849 
4850 /**
4851  * Populate VS program key fields based on the current state.
4852  */
4853 static void
4854 iris_populate_vs_key(const struct iris_context *ice,
4855                      const struct shader_info *info,
4856                      gl_shader_stage last_stage,
4857                      struct iris_vs_prog_key *key)
4858 {
4859    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4860 
4861    if (info->clip_distance_array_size == 0 &&
4862        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4863        last_stage == MESA_SHADER_VERTEX)
4864       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4865 }
4866 
4867 /**
4868  * Populate TCS program key fields based on the current state.
4869  */
4870 static void
4871 iris_populate_tcs_key(const struct iris_context *ice,
4872                       struct iris_tcs_prog_key *key)
4873 {
4874 }
4875 
4876 /**
4877  * Populate TES program key fields based on the current state.
4878  */
4879 static void
4880 iris_populate_tes_key(const struct iris_context *ice,
4881                       const struct shader_info *info,
4882                       gl_shader_stage last_stage,
4883                       struct iris_tes_prog_key *key)
4884 {
4885    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4886 
4887    if (info->clip_distance_array_size == 0 &&
4888        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4889        last_stage == MESA_SHADER_TESS_EVAL)
4890       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4891 }
4892 
4893 /**
4894  * Populate GS program key fields based on the current state.
4895  */
4896 static void
4897 iris_populate_gs_key(const struct iris_context *ice,
4898                      const struct shader_info *info,
4899                      gl_shader_stage last_stage,
4900                      struct iris_gs_prog_key *key)
4901 {
4902    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4903 
4904    if (info->clip_distance_array_size == 0 &&
4905        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4906        last_stage == MESA_SHADER_GEOMETRY)
4907       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4908 }
4909 
4910 /**
4911  * Populate FS program key fields based on the current state.
4912  */
4913 static void
4914 iris_populate_fs_key(const struct iris_context *ice,
4915                      const struct shader_info *info,
4916                      struct iris_fs_prog_key *key)
4917 {
4918    struct iris_screen *screen = (void *) ice->ctx.screen;
4919    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4920    const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4921    const struct iris_rasterizer_state *rast = ice->state.cso_rast;
4922    const struct iris_blend_state *blend = ice->state.cso_blend;
4923 
4924    key->nr_color_regions = fb->nr_cbufs;
4925 
4926    key->clamp_fragment_color = rast->clamp_fragment_color;
4927 
4928    key->alpha_to_coverage = blend->alpha_to_coverage;
4929 
4930    key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
4931 
4932    key->flat_shade = rast->flatshade &&
4933       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4934 
4935    key->persample_interp = rast->force_persample_interp;
4936    key->multisample_fbo = rast->multisample && fb->samples > 1;
4937 
4938    key->coherent_fb_fetch = GFX_VER >= 9 && GFX_VER < 20;
4939 
4940    key->force_dual_color_blend =
4941       screen->driconf.dual_color_blend_by_location &&
4942       (blend->blend_enables & 1) && blend->dual_color_blending;
4943 }
4944 
4945 static void
4946 iris_populate_cs_key(const struct iris_context *ice,
4947                      struct iris_cs_prog_key *key)
4948 {
4949 }
4950 
4951 static inline uint32_t
4952 encode_sampler_count(const struct iris_compiled_shader *shader)
4953 {
4954    /* We can potentially have way more than 32 samplers and that's ok.
4955     * However, the 3DSTATE_XS packets only have 3 bits to specify how
4956     * many to pre-fetch and all values above 4 are marked reserved.
4957     */
4958    uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
4959    return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
4960 }
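/* Illustrative mapping for encode_sampler_count() above: 0 samplers encode
 * as 0, 1-4 as 1, 5-8 as 2, 9-12 as 3, and 13 or more as 4, since the count
 * is clamped to 16 before dividing by 4.
 */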
4961 
4962 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
4963    pkt.KernelStartPointer = KSP(shader);                                  \
4964    pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
4965    pkt.SamplerCount = encode_sampler_count(shader);                       \
4966    pkt.FloatingPointMode = shader->use_alt_mode;                          \
4967                                                                           \
4968    pkt.DispatchGRFStartRegisterForURBData =                               \
4969       shader->dispatch_grf_start_reg;                                     \
4970    pkt.prefix##URBEntryReadLength = vue_data->urb_read_length;            \
4971    pkt.prefix##URBEntryReadOffset = 0;                                    \
4972                                                                           \
4973    pkt.StatisticsEnable = true;                                           \
4974    pkt.Enable           = true;                                           \
4975                                                                           \
4976    if (shader->total_scratch) {                                           \
4977       INIT_THREAD_SCRATCH_SIZE(pkt)                                       \
4978    }
4979 
4980 /* Note that on Gfx12HP we pass a scratch space surface state offset
4981  * shifted by 2 relative to the value specified in the BSpec, since
4982  * that allows the compiler to save a shift instruction while
4983  * constructing the extended descriptor for SS addressing.  That
4984  * worked because we limit the scratch surface state pool to 8 MB and
4985  * because we relied on the legacy (ExBSO=0) encoding of the extended
4986  * descriptor to save the shift.  That encoding is no longer supported
4987  * for the UGM shared function on Xe2 platforms, so we no longer
4988  * attempt that trick there.
4989  */
4990 #define SCRATCH_SPACE_BUFFER_SHIFT (GFX_VER >= 20 ? 6 : 4)
4991 
4992 #if GFX_VERx10 >= 125
4993 #define INIT_THREAD_SCRATCH_SIZE(pkt)
4994 #define MERGE_SCRATCH_ADDR(name)                                          \
4995 {                                                                         \
4996    uint32_t pkt2[GENX(name##_length)] = {0};                              \
4997    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
4998       p.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;  \
4999    }                                                                      \
5000    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5001 }
5002 #else
5003 #define INIT_THREAD_SCRATCH_SIZE(pkt)                                     \
5004    pkt.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
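/* Illustrative note (assuming total_scratch is a power-of-two size of at
 * least 1 KB, as the expression above implies): ffs(total_scratch) - 11
 * equals log2(total_scratch) - 10, so 1 KB encodes as 0, 2 KB as 1, and
 * 64 KB as 6.
 */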
5005 #define MERGE_SCRATCH_ADDR(name)                                          \
5006 {                                                                         \
5007    uint32_t pkt2[GENX(name##_length)] = {0};                              \
5008    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
5009       p.ScratchSpaceBasePointer =                                         \
5010          rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);                     \
5011    }                                                                      \
5012    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5013 }
5014 #endif
5015 
5016 
5017 /**
5018  * Encode most of 3DSTATE_VS based on the compiled shader.
5019  */
5020 static void
5021 iris_store_vs_state(const struct intel_device_info *devinfo,
5022                     struct iris_compiled_shader *shader)
5023 {
5024    struct iris_vue_data *vue_data = iris_vue_data(shader);
5025 
5026    iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
5027       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
5028       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
5029 #if GFX_VER < 20
5030       vs.SIMD8DispatchEnable = true;
5031 #endif
5032       vs.UserClipDistanceCullTestEnableBitmask =
5033          vue_data->cull_distance_mask;
5034    }
5035 }
5036 
5037 /**
5038  * Encode most of 3DSTATE_HS based on the compiled shader.
5039  */
5040 static void
5041 iris_store_tcs_state(const struct intel_device_info *devinfo,
5042                      struct iris_compiled_shader *shader)
5043 {
5044    struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
5045    struct iris_vue_data *vue_data = &tcs_data->base;
5046 
5047    iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
5048       INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
5049 
5050 #if GFX_VER >= 12
5051       /* Wa_1604578095:
5052        *
5053        *    Hang occurs when the number of max threads is less than 2 times
5054        *    the number of instance count. The number of max threads must be
5055        *    more than 2 times the number of instance count.
5056        */
5057       assert((devinfo->max_tcs_threads / 2) > tcs_data->instances);
5058       hs.DispatchGRFStartRegisterForURBData = shader->dispatch_grf_start_reg & 0x1f;
5059       hs.DispatchGRFStartRegisterForURBData5 = shader->dispatch_grf_start_reg >> 5;
5060 #endif
5061 
5062       hs.InstanceCount = tcs_data->instances - 1;
5063       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
5064       hs.IncludeVertexHandles = true;
5065 
5066 #if GFX_VER == 12
5067       /* Patch Count threshold specifies the maximum number of patches that
5068        * will be accumulated before a thread dispatch is forced.
5069        */
5070       hs.PatchCountThreshold = tcs_data->patch_count_threshold;
5071 #endif
5072 
5073 #if GFX_VER >= 9
5074 #if GFX_VER < 20
5075       hs.DispatchMode = vue_data->dispatch_mode;
5076 #endif
5077       hs.IncludePrimitiveID = tcs_data->include_primitive_id;
5078 #endif
5079    }
5080 }
5081 
5082 /**
5083  * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
5084  */
5085 static void
5086 iris_store_tes_state(const struct intel_device_info *devinfo,
5087                      struct iris_compiled_shader *shader)
5088 {
5089    struct iris_tes_data *tes_data = iris_tes_data(shader);
5090    struct iris_vue_data *vue_data = &tes_data->base;
5091 
5092    uint32_t *ds_state = (void *) shader->derived_data;
5093    uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
5094 
5095    iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
5096       INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
5097 
5098       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
5099       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
5100       ds.ComputeWCoordinateEnable =
5101          tes_data->domain == INTEL_TESS_DOMAIN_TRI;
5102 
5103 #if GFX_VER >= 12
5104       ds.PrimitiveIDNotRequired = !tes_data->include_primitive_id;
5105 #endif
5106       ds.UserClipDistanceCullTestEnableBitmask =
5107          vue_data->cull_distance_mask;
5108    }
5109 
5110    iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
5111       te.Partitioning = tes_data->partitioning;
5112 #if GFX_VER >= 20
5113       te.NumberOfRegionsPerPatch = 2;
5114 #endif
5115       te.OutputTopology = tes_data->output_topology;
5116       te.TEDomain = tes_data->domain;
5117       te.TEEnable = true;
5118       te.MaximumTessellationFactorOdd = 63.0;
5119       te.MaximumTessellationFactorNotOdd = 64.0;
5120 #if GFX_VERx10 >= 125
5121       STATIC_ASSERT(TEDMODE_OFF == 0);
5122       if (intel_needs_workaround(devinfo, 14015055625)) {
5123          te.TessellationDistributionMode = TEDMODE_OFF;
5124       } else if (intel_needs_workaround(devinfo, 22012699309)) {
5125          te.TessellationDistributionMode = TEDMODE_RR_STRICT;
5126       } else {
5127          te.TessellationDistributionMode = TEDMODE_RR_FREE;
5128       }
5129 
5130    #if GFX_VER >= 20
5131       te.TessellationDistributionLevel = TEDLEVEL_REGION;
5132    #else
5133       te.TessellationDistributionLevel = TEDLEVEL_PATCH;
5134    #endif
5135       /* 64_TRIANGLES */
5136       te.SmallPatchThreshold = 3;
5137       /* 1K_TRIANGLES */
5138       te.TargetBlockSize = 8;
5139       /* 1K_TRIANGLES */
5140       te.LocalBOPAccumulatorThreshold = 1;
5141 #endif
5142    }
5143 }
5144 
5145 /**
5146  * Encode most of 3DSTATE_GS based on the compiled shader.
5147  */
5148 static void
5149 iris_store_gs_state(const struct intel_device_info *devinfo,
5150                     struct iris_compiled_shader *shader)
5151 {
5152    struct iris_gs_data *gs_data = iris_gs_data(shader);
5153    struct iris_vue_data *vue_data = &gs_data->base;
5154 
5155    iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
5156       INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
5157 
5158       gs.OutputVertexSize = gs_data->output_vertex_size_hwords * 2 - 1;
5159       gs.OutputTopology = gs_data->output_topology;
5160       gs.ControlDataHeaderSize = gs_data->control_data_header_size_hwords;
5161       gs.InstanceControl = gs_data->invocations - 1;
5162 #if GFX_VER < 20
5163       gs.DispatchMode = DISPATCH_MODE_SIMD8;
5164 #endif
5165       gs.IncludePrimitiveID = gs_data->include_primitive_id;
5166       gs.ControlDataFormat = gs_data->control_data_format;
5167       gs.ReorderMode = TRAILING;
5168       gs.ExpectedVertexCount = gs_data->vertices_in;
5169       gs.MaximumNumberofThreads =
5170          GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
5171                       : (devinfo->max_gs_threads - 1);
5172 
5173       if (gs_data->static_vertex_count != -1) {
5174          gs.StaticOutput = true;
5175          gs.StaticOutputVertexCount = gs_data->static_vertex_count;
5176       }
5177       gs.IncludeVertexHandles = vue_data->include_vue_handles;
5178 
5179       gs.UserClipDistanceCullTestEnableBitmask = vue_data->cull_distance_mask;
5180 
5181       const int urb_entry_write_offset = 1;
5182       const uint32_t urb_entry_output_length =
5183          DIV_ROUND_UP(vue_data->vue_map.num_slots, 2) - urb_entry_write_offset;
5184 
5185       gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
5186       gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
5187    }
5188 }
5189 
5190 /**
5191  * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
5192  */
5193 static void
5194 iris_store_fs_state(const struct intel_device_info *devinfo,
5195                     struct iris_compiled_shader *shader)
5196 {
5197    struct iris_fs_data *fs_data = iris_fs_data(shader);
5198 
5199    uint32_t *ps_state = (void *) shader->derived_data;
5200    uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
5201 
5202    iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
5203       ps.VectorMaskEnable = fs_data->uses_vmask;
5204       ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
5205       ps.SamplerCount = encode_sampler_count(shader);
5206       ps.FloatingPointMode = shader->use_alt_mode;
5207       ps.MaximumNumberofThreadsPerPSD =
5208          devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
5209 
5210 #if GFX_VER < 20
5211       ps.PushConstantEnable = devinfo->needs_null_push_constant_tbimr_workaround ||
5212                               shader->ubo_ranges[0].length > 0;
5213 #endif
5214 
5215       /* From the documentation for this packet:
5216        * "If the PS kernel does not need the Position XY Offsets to
5217        *  compute a Position Value, then this field should be programmed
5218        *  to POSOFFSET_NONE."
5219        *
5220        * "SW Recommendation: If the PS kernel needs the Position Offsets
5221        *  to compute a Position XY value, this field should match Position
5222        *  ZW Interpolation Mode to ensure a consistent position.xyzw
5223        *  computation."
5224        *
5225        * We only require XY sample offsets, so this recommendation doesn't
5226        * look useful at the moment.  We might need it in the future.
5227        */
5228       ps.PositionXYOffsetSelect =
5229          fs_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
5230 
5231       if (shader->total_scratch) {
5232          INIT_THREAD_SCRATCH_SIZE(ps);
5233       }
5234    }
5235 
5236    iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
5237       psx.PixelShaderValid = true;
5238       psx.PixelShaderComputedDepthMode = fs_data->computed_depth_mode;
5239       psx.PixelShaderKillsPixel = fs_data->uses_kill;
5240 #if GFX_VER < 20
5241       psx.AttributeEnable = fs_data->num_varying_inputs != 0;
5242 #endif
5243       psx.PixelShaderUsesSourceDepth = fs_data->uses_src_depth;
5244       psx.PixelShaderUsesSourceW = fs_data->uses_src_w;
5245       psx.PixelShaderIsPerSample = fs_data->is_per_sample;
5246       psx.oMaskPresenttoRenderTarget = fs_data->uses_omask;
5247 
5248 #if GFX_VER >= 9
5249 #if GFX_VER >= 20
5250       assert(!fs_data->pulls_bary);
5251 #else
5252       psx.PixelShaderPullsBary = fs_data->pulls_bary;
5253 #endif
5254       psx.PixelShaderComputesStencil = fs_data->computed_stencil;
5255 #endif
5256 
5257 #if GFX_VER >= 11
5258       psx.PixelShaderRequiresSubpixelSampleOffsets =
5259          fs_data->uses_sample_offsets;
5260       psx.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
5261          fs_data->uses_npc_bary_coefficients;
5262       psx.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
5263          fs_data->uses_pc_bary_coefficients;
5264       psx.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
5265          fs_data->uses_depth_w_coefficients;
5266 #endif
5267    }
5268 }
5269 
5270 /**
5271  * Compute the size of the derived data (shader command packets).
5272  *
5273  * This must match the data written by the iris_store_xs_state() functions.
5274  */
5275 static void
5276 iris_store_cs_state(const struct intel_device_info *devinfo,
5277                     struct iris_compiled_shader *shader)
5278 {
5279    struct iris_cs_data *cs_data = iris_cs_data(shader);
5280    void *map = shader->derived_data;
5281 
5282    iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
5283 #if GFX_VERx10 < 125
5284       desc.ConstantURBEntryReadLength = cs_data->push.per_thread.regs;
5285       desc.CrossThreadConstantDataReadLength =
5286          cs_data->push.cross_thread.regs;
5287 #else
5288       assert(cs_data->push.per_thread.regs == 0);
5289       assert(cs_data->push.cross_thread.regs == 0);
5290 #endif
5291 #if GFX_VERx10 <= 125
5292       desc.BarrierEnable = cs_data->uses_barrier;
5293 #endif
5294       /* Typically set to 0 to avoid prefetching on every thread dispatch. */
5295       desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
5296          0 : MIN2(shader->bt.size_bytes / 4, 31);
5297       desc.SamplerCount = encode_sampler_count(shader);
5298       /* TODO: Check if we are missing workarounds and enable mid-thread
5299        * preemption.
5300        *
5301        * We still have issues with mid-thread preemption (it was already
5302        * disabled by the kernel on gfx11, due to missing workarounds). It's
5303        * possible that we are just missing some workarounds, and could enable
5304  * it later, but for now let's disable it to fix a GPU hang in compute in
5305  * Car Chase (and possibly more).
5306        */
5307 #if GFX_VER >= 20
5308       desc.ThreadPreemption = false;
5309 #elif GFX_VER >= 12
5310       desc.ThreadPreemptionDisable = true;
5311 #endif
5312    }
5313 }
5314 
5315 static unsigned
5316 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
5317 {
5318    assert(cache_id <= IRIS_CACHE_BLORP);
5319 
5320    static const unsigned dwords[] = {
5321       [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
5322       [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
5323       [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
5324       [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
5325       [IRIS_CACHE_FS] =
5326          GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
5327       [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
5328       [IRIS_CACHE_BLORP] = 0,
5329    };
5330 
5331    return sizeof(uint32_t) * dwords[cache_id];
5332 }
5333 
5334 /**
5335  * Create any state packets corresponding to the given shader stage
5336  * (i.e. 3DSTATE_VS) and save them as "derived data" in the shader variant.
5337  * This means that we can look up a program in the in-memory cache and
5338  * get most of the state packet without having to reconstruct it.
5339  */
5340 static void
5341 iris_store_derived_program_state(const struct intel_device_info *devinfo,
5342                                  enum iris_program_cache_id cache_id,
5343                                  struct iris_compiled_shader *shader)
5344 {
5345    switch (cache_id) {
5346    case IRIS_CACHE_VS:
5347       iris_store_vs_state(devinfo, shader);
5348       break;
5349    case IRIS_CACHE_TCS:
5350       iris_store_tcs_state(devinfo, shader);
5351       break;
5352    case IRIS_CACHE_TES:
5353       iris_store_tes_state(devinfo, shader);
5354       break;
5355    case IRIS_CACHE_GS:
5356       iris_store_gs_state(devinfo, shader);
5357       break;
5358    case IRIS_CACHE_FS:
5359       iris_store_fs_state(devinfo, shader);
5360       break;
5361    case IRIS_CACHE_CS:
5362       iris_store_cs_state(devinfo, shader);
5363       break;
5364    case IRIS_CACHE_BLORP:
5365       break;
5366    }
5367 }
5368 
5369 /* ------------------------------------------------------------------- */
5370 
5371 static const uint32_t push_constant_opcodes[] = {
5372    [MESA_SHADER_VERTEX]    = 21,
5373    [MESA_SHADER_TESS_CTRL] = 25, /* HS */
5374    [MESA_SHADER_TESS_EVAL] = 26, /* DS */
5375    [MESA_SHADER_GEOMETRY]  = 22,
5376    [MESA_SHADER_FRAGMENT]  = 23,
5377    [MESA_SHADER_COMPUTE]   = 0,
5378 };
5379 
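/**
 * Pin the BO holding the context's "unbound texture" null SURFACE_STATE and
 * return its offset, for use as a null binding table entry.
 */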
5380 static uint32_t
5381 use_null_surface(struct iris_batch *batch, struct iris_context *ice)
5382 {
5383    struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
5384 
5385    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5386 
5387    return ice->state.unbound_tex.offset;
5388 }
5389 
5390 static uint32_t
5391 use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
5392 {
5393    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
5394    if (!ice->state.null_fb.res)
5395       return use_null_surface(batch, ice);
5396 
5397    struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
5398 
5399    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5400 
5401    return ice->state.null_fb.offset;
5402 }
5403 
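/**
 * Surface states for a resource are stored as a packed group, one copy per
 * enabled aux mode.  Return the byte offset of the copy for a given aux
 * usage, counting SURFACE_STATE_ALIGNMENT bytes for each lower-numbered
 * enabled mode.
 */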
5404 static uint32_t
5405 surf_state_offset_for_aux(unsigned aux_modes,
5406                           enum isl_aux_usage aux_usage)
5407 {
5408    assert(aux_modes & (1 << aux_usage));
5409    return SURFACE_STATE_ALIGNMENT *
5410           util_bitcount(aux_modes & ((1 << aux_usage) - 1));
5411 }
5412 
5413 #if GFX_VER == 9
5414 static void
5415 surf_state_update_clear_value(struct iris_batch *batch,
5416                               struct iris_resource *res,
5417                               struct iris_surface_state *surf_state,
5418                               enum isl_aux_usage aux_usage)
5419 {
5420    struct isl_device *isl_dev = &batch->screen->isl_dev;
5421    struct iris_bo *state_bo = iris_resource_bo(surf_state->ref.res);
5422    uint64_t real_offset = surf_state->ref.offset + IRIS_MEMZONE_BINDER_START;
5423    uint32_t offset_into_bo = real_offset - state_bo->address;
5424    uint32_t clear_offset = offset_into_bo +
5425       isl_dev->ss.clear_value_offset +
5426       surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5427    uint32_t *color = res->aux.clear_color.u32;
5428 
5429    assert(isl_dev->ss.clear_value_size == 16);
5430 
5431    if (aux_usage == ISL_AUX_USAGE_HIZ) {
5432       iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
5433                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5434                                    state_bo, clear_offset, color[0]);
5435    } else {
5436       iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
5437                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5438                                    state_bo, clear_offset,
5439                                    (uint64_t) color[0] |
5440                                    (uint64_t) color[1] << 32);
5441       iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
5442                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5443                                    state_bo, clear_offset + 8,
5444                                    (uint64_t) color[2] |
5445                                    (uint64_t) color[3] << 32);
5446    }
5447 
5448    iris_emit_pipe_control_flush(batch,
5449                                 "update fast clear: state cache invalidate",
5450                                 PIPE_CONTROL_FLUSH_ENABLE |
5451                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
5452 }
5453 #endif
5454 
5455 static void
5456 update_clear_value(struct iris_context *ice,
5457                    struct iris_batch *batch,
5458                    struct iris_resource *res,
5459                    struct iris_surface_state *surf_state,
5460                    struct isl_view *view)
5461 {
5462    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5463    UNUSED unsigned aux_modes = surf_state->aux_usages;
5464 
5465    /* We only need to update the clear color in the surface state for gfx8 and
5466     * gfx9. Newer gens can read it directly from the clear color state buffer.
5467     */
5468 #if GFX_VER == 9
5469    /* Skip updating the ISL_AUX_USAGE_NONE surface state */
5470    aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
5471 
5472    while (aux_modes) {
5473       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
5474 
5475       surf_state_update_clear_value(batch, res, surf_state, aux_usage);
5476    }
5477 #elif GFX_VER == 8
5478    /* TODO: Could update rather than re-filling */
5479    alloc_surface_states(surf_state, surf_state->aux_usages);
5480 
5481    fill_surface_states(isl_dev, surf_state, res, &res->surf, view, 0, 0, 0);
5482 
5483    upload_surface_states(ice->state.surface_uploader, surf_state);
5484 #endif
5485 }
5486 
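/**
 * Pin the BO containing a surface's SURFACE_STATE and return the binding
 * table offset of the copy matching the requested aux usage.
 */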
5487 static uint32_t
5488 use_surface_state(struct iris_batch *batch,
5489                   struct iris_surface_state *surf_state,
5490                   enum isl_aux_usage aux_usage)
5491 {
5492    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->ref.res), false,
5493                       IRIS_DOMAIN_NONE);
5494 
5495    return surf_state->ref.offset +
5496           surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5497 }
5498 
5499 /**
5500  * Add a surface to the validation list, as well as the buffer containing
5501  * the corresponding SURFACE_STATE.
5502  *
5503  * Returns the binding table entry (offset to SURFACE_STATE).
5504  */
5505 static uint32_t
5506 use_surface(struct iris_context *ice,
5507             struct iris_batch *batch,
5508             struct pipe_surface *p_surf,
5509             bool writeable,
5510             enum isl_aux_usage aux_usage,
5511             bool is_read_surface,
5512             enum iris_domain access)
5513 {
5514    struct iris_surface *surf = (void *) p_surf;
5515    struct iris_resource *res = (void *) p_surf->texture;
5516 
5517    if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
5518       upload_surface_states(ice->state.surface_uploader,
5519                             &surf->surface_state_read);
5520    }
5521 
5522    if (!surf->surface_state.ref.res) {
5523       upload_surface_states(ice->state.surface_uploader,
5524                             &surf->surface_state);
5525    }
5526 
5527    if (memcmp(&res->aux.clear_color, &surf->clear_color,
5528               sizeof(surf->clear_color)) != 0) {
5529       update_clear_value(ice, batch, res, &surf->surface_state, &surf->view);
5530       if (GFX_VER == 8) {
5531          update_clear_value(ice, batch, res, &surf->surface_state_read,
5532                             &surf->read_view);
5533       }
5534       surf->clear_color = res->aux.clear_color;
5535    }
5536 
5537    if (res->aux.clear_color_bo)
5538       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
5539 
5540    if (res->aux.bo)
5541       iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
5542 
5543    iris_use_pinned_bo(batch, res->bo, writeable, access);
5544 
5545    if (GFX_VER == 8 && is_read_surface) {
5546       return use_surface_state(batch, &surf->surface_state_read, aux_usage);
5547    } else {
5548       return use_surface_state(batch, &surf->surface_state, aux_usage);
5549    }
5550 }
5551 
5552 static uint32_t
5553 use_sampler_view(struct iris_context *ice,
5554                  struct iris_batch *batch,
5555                  struct iris_sampler_view *isv)
5556 {
5557    enum isl_aux_usage aux_usage =
5558       iris_resource_texture_aux_usage(ice, isv->res, isv->view.format,
5559                                       isv->view.base_level, isv->view.levels);
5560 
5561    if (!isv->surface_state.ref.res)
5562       upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
5563 
5564    if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
5565               sizeof(isv->clear_color)) != 0) {
5566       update_clear_value(ice, batch, isv->res, &isv->surface_state,
5567                          &isv->view);
5568       isv->clear_color = isv->res->aux.clear_color;
5569    }
5570 
5571    if (isv->res->aux.clear_color_bo) {
5572       iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
5573                          false, IRIS_DOMAIN_SAMPLER_READ);
5574    }
5575 
5576    if (isv->res->aux.bo) {
5577       iris_use_pinned_bo(batch, isv->res->aux.bo,
5578                          false, IRIS_DOMAIN_SAMPLER_READ);
5579    }
5580 
5581    iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_SAMPLER_READ);
5582 
5583    return use_surface_state(batch, &isv->surface_state, aux_usage);
5584 }
5585 
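/**
 * Pin a UBO or SSBO and its SURFACE_STATE, returning the surface state
 * offset.  Falls back to the null surface if no buffer is bound.
 */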
5586 static uint32_t
5587 use_ubo_ssbo(struct iris_batch *batch,
5588              struct iris_context *ice,
5589              struct pipe_shader_buffer *buf,
5590              struct iris_state_ref *surf_state,
5591              bool writable, enum iris_domain access)
5592 {
5593    if (!buf->buffer || !surf_state->res)
5594       return use_null_surface(batch, ice);
5595 
5596    iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
5597    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
5598                       IRIS_DOMAIN_NONE);
5599 
5600    return surf_state->offset;
5601 }
5602 
5603 static uint32_t
5604 use_image(struct iris_batch *batch, struct iris_context *ice,
5605           struct iris_shader_state *shs, const struct shader_info *info,
5606           int i)
5607 {
5608    struct iris_image_view *iv = &shs->image[i];
5609    struct iris_resource *res = (void *) iv->base.resource;
5610 
5611    if (!res)
5612       return use_null_surface(batch, ice);
5613 
5614    bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5615 
5616    iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
5617 
5618    if (res->aux.bo)
5619       iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
5620 
5621    if (res->aux.clear_color_bo) {
5622       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false,
5623                          IRIS_DOMAIN_NONE);
5624    }
5625 
5626    enum isl_aux_usage aux_usage = shs->image_aux_usage[i];
5627 
5628    return use_surface_state(batch, &iv->surface_state, aux_usage);
5629 }
5630 
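/* push_bt_entry() records a binding table entry for a SURFACE_STATE address,
 * converting it to a binder-relative offset (on Gfx11+ the offsets are
 * already binder-relative, so surf_base_offset below is 0).  bt_assert()
 * checks that entries land where the shader's binding table layout expects.
 */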
5631 #define push_bt_entry(addr) \
5632    assert(addr >= surf_base_offset); \
5633    assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
5634    if (!pin_only) bt_map[s++] = (addr) - surf_base_offset;
5635 
5636 #define bt_assert(section) \
5637    if (!pin_only && shader->bt.used_mask[section] != 0) \
5638       assert(shader->bt.offsets[section] == s);
5639 
5640 /**
5641  * Populate the binding table for a given shader stage.
5642  *
5643  * This fills out the table of pointers to surfaces required by the shader,
5644  * and also adds those buffers to the validation list so the kernel can make
5645  * them resident before running our batch.
5646  */
5647 static void
5648 iris_populate_binding_table(struct iris_context *ice,
5649                             struct iris_batch *batch,
5650                             gl_shader_stage stage,
5651                             bool pin_only)
5652 {
5653    const struct iris_binder *binder = &ice->state.binder;
5654    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5655    if (!shader)
5656       return;
5657 
5658    struct iris_binding_table *bt = &shader->bt;
5659    struct iris_shader_state *shs = &ice->state.shaders[stage];
5660    uint32_t surf_base_offset = GFX_VER < 11 ? binder->bo->address : 0;
5661 
5662    uint32_t *bt_map = binder->map + binder->bt_offset[stage];
5663    int s = 0;
5664 
5665    const struct shader_info *info = iris_get_shader_info(ice, stage);
5666    if (!info) {
5667       /* TCS passthrough doesn't need a binding table. */
5668       assert(stage == MESA_SHADER_TESS_CTRL);
5669       return;
5670    }
5671 
5672    if (stage == MESA_SHADER_COMPUTE &&
5673        shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
5674       /* surface for gl_NumWorkGroups */
5675       struct iris_state_ref *grid_data = &ice->state.grid_size;
5676       struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
5677       iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
5678                          IRIS_DOMAIN_PULL_CONSTANT_READ);
5679       iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
5680                          IRIS_DOMAIN_NONE);
5681       push_bt_entry(grid_state->offset);
5682    }
5683 
5684    if (stage == MESA_SHADER_FRAGMENT) {
5685       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5686       /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5687       if (cso_fb->nr_cbufs) {
5688          for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5689             uint32_t addr;
5690             if (cso_fb->cbufs[i]) {
5691                addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
5692                                   ice->state.draw_aux_usage[i], false,
5693                                   IRIS_DOMAIN_RENDER_WRITE);
5694             } else {
5695                addr = use_null_fb_surface(batch, ice);
5696             }
5697             push_bt_entry(addr);
5698          }
5699       } else if (GFX_VER < 11) {
5700          uint32_t addr = use_null_fb_surface(batch, ice);
5701          push_bt_entry(addr);
5702       }
5703    }
5704 
5705 #define foreach_surface_used(index, group) \
5706    bt_assert(group); \
5707    for (int index = 0; index < bt->sizes[group]; index++) \
5708       if (iris_group_index_to_bti(bt, group, index) != \
5709           IRIS_SURFACE_NOT_USED)
5710 
5711    foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5712       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5713       uint32_t addr;
5714       if (cso_fb->cbufs[i]) {
5715          addr = use_surface(ice, batch, cso_fb->cbufs[i],
5716                             false, ice->state.draw_aux_usage[i], true,
5717                             IRIS_DOMAIN_SAMPLER_READ);
5718          push_bt_entry(addr);
5719       }
5720    }
5721 
5722    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_LOW64) {
5723       struct iris_sampler_view *view = shs->textures[i];
5724       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5725                            : use_null_surface(batch, ice);
5726       push_bt_entry(addr);
5727    }
5728 
5729    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_HIGH64) {
5730       struct iris_sampler_view *view = shs->textures[64 + i];
5731       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5732                            : use_null_surface(batch, ice);
5733       push_bt_entry(addr);
5734    }
5735 
5736    foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5737       uint32_t addr = use_image(batch, ice, shs, info, i);
5738       push_bt_entry(addr);
5739    }
5740 
5741    foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5742       uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5743                                    &shs->constbuf_surf_state[i], false,
5744                                    IRIS_DOMAIN_PULL_CONSTANT_READ);
5745       push_bt_entry(addr);
5746    }
5747 
5748    foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5749       uint32_t addr =
5750          use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5751                       shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5752       push_bt_entry(addr);
5753    }
5754 
5755 #if 0
5756       /* XXX: YUV surfaces not implemented yet */
5757       bt_assert(plane_start[1], ...);
5758       bt_assert(plane_start[2], ...);
5759 #endif
5760 }
5761 
5762 static void
5763 iris_use_optional_res(struct iris_batch *batch,
5764                       struct pipe_resource *res,
5765                       bool writeable,
5766                       enum iris_domain access)
5767 {
5768    if (res) {
5769       struct iris_bo *bo = iris_resource_bo(res);
5770       iris_use_pinned_bo(batch, bo, writeable, access);
5771    }
5772 }
5773 
5774 static void
5775 pin_depth_and_stencil_buffers(struct iris_batch *batch,
5776                               struct pipe_surface *zsbuf,
5777                               struct iris_depth_stencil_alpha_state *cso_zsa)
5778 {
5779    if (!zsbuf)
5780       return;
5781 
5782    struct iris_resource *zres, *sres;
5783    iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5784 
5785    if (zres) {
5786       iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5787                          IRIS_DOMAIN_DEPTH_WRITE);
5788       if (zres->aux.bo) {
5789          iris_use_pinned_bo(batch, zres->aux.bo,
5790                             cso_zsa->depth_writes_enabled,
5791                             IRIS_DOMAIN_DEPTH_WRITE);
5792       }
5793    }
5794 
5795    if (sres) {
5796       iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5797                          IRIS_DOMAIN_DEPTH_WRITE);
5798    }
5799 }
5800 
5801 static uint32_t
5802 pin_scratch_space(struct iris_context *ice,
5803                   struct iris_batch *batch,
5804                   const struct iris_compiled_shader *shader,
5805                   gl_shader_stage stage)
5806 {
5807    uint32_t scratch_addr = 0;
5808 
5809    if (shader->total_scratch > 0) {
5810       struct iris_bo *scratch_bo =
5811          iris_get_scratch_space(ice, shader->total_scratch, stage);
5812       iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5813 
5814 #if GFX_VERx10 >= 125
5815       const struct iris_state_ref *ref =
5816          iris_get_scratch_surf(ice, shader->total_scratch);
5817       iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5818                          false, IRIS_DOMAIN_NONE);
5819       scratch_addr = ref->offset +
5820                      iris_resource_bo(ref->res)->address -
5821                      IRIS_MEMZONE_SCRATCH_START;
5822       assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
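      /* On Gfx12.5+ this is not a memory address but the offset of the
       * scratch SURFACE_STATE from the start of the scratch memzone, which
       * must be 64B-aligned and fit in 26 bits (hence the assert above).
       */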
5823 #else
5824       scratch_addr = scratch_bo->address;
5825 #endif
5826    }
5827 
5828    return scratch_addr;
5829 }
5830 
5831 /* ------------------------------------------------------------------- */
5832 
5833 /**
5834  * Pin any BOs which were installed by a previous batch, and restored
5835  * via the hardware logical context mechanism.
5836  *
5837  * We don't need to re-emit all state every batch - the hardware context
5838  * mechanism will save and restore it for us.  This includes pointers to
5839  * various BOs...which won't exist unless we ask the kernel to pin them
5840  * by adding them to the validation list.
5841  *
5842  * We can skip buffers if we've re-emitted those packets, as we're
5843  * overwriting those stale pointers with new ones, and don't actually
5844  * refer to the old BOs.
5845  */
5846 static void
5847 iris_restore_render_saved_bos(struct iris_context *ice,
5848                               struct iris_batch *batch,
5849                               const struct pipe_draw_info *draw)
5850 {
5851    struct iris_genx_state *genx = ice->state.genx;
5852 
5853    const uint64_t clean = ~ice->state.dirty;
5854    const uint64_t stage_clean = ~ice->state.stage_dirty;
5855 
5856    if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5857       iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5858                             IRIS_DOMAIN_NONE);
5859    }
5860 
5861    if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
5862       iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
5863                             IRIS_DOMAIN_NONE);
5864    }
5865 
5866    if (clean & IRIS_DIRTY_BLEND_STATE) {
5867       iris_use_optional_res(batch, ice->state.last_res.blend, false,
5868                             IRIS_DOMAIN_NONE);
5869    }
5870 
5871    if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
5872       iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
5873                             IRIS_DOMAIN_NONE);
5874    }
5875 
5876    if (clean & IRIS_DIRTY_SCISSOR_RECT) {
5877       iris_use_optional_res(batch, ice->state.last_res.scissor, false,
5878                             IRIS_DOMAIN_NONE);
5879    }
5880 
5881    if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
5882       for (int i = 0; i < 4; i++) {
5883          struct iris_stream_output_target *tgt =
5884             (void *) ice->state.so_target[i];
5885          if (tgt) {
5886             iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
5887                                true, IRIS_DOMAIN_OTHER_WRITE);
5888             iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
5889                                true, IRIS_DOMAIN_OTHER_WRITE);
5890          }
5891       }
5892    }
5893 
5894    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5895       if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
5896          continue;
5897 
5898       struct iris_shader_state *shs = &ice->state.shaders[stage];
5899       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5900 
5901       if (!shader)
5902          continue;
5903 
5904       for (int i = 0; i < 4; i++) {
5905          const struct iris_ubo_range *range = &shader->ubo_ranges[i];
5906 
5907          if (range->length == 0)
5908             continue;
5909 
5910          /* Range block is a binding table index, map back to UBO index. */
5911          unsigned block_index = iris_bti_to_group_index(
5912             &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
5913          assert(block_index != IRIS_SURFACE_NOT_USED);
5914 
5915          struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
5916          struct iris_resource *res = (void *) cbuf->buffer;
5917 
5918          if (res)
5919             iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
5920          else
5921             iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
5922                                IRIS_DOMAIN_OTHER_READ);
5923       }
5924    }
5925 
5926    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5927       if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
5928          /* Re-pin any buffers referred to by the binding table. */
5929          iris_populate_binding_table(ice, batch, stage, true);
5930       }
5931    }
5932 
5933    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5934       struct iris_shader_state *shs = &ice->state.shaders[stage];
5935       struct pipe_resource *res = shs->sampler_table.res;
5936       if (res)
5937          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5938                             IRIS_DOMAIN_NONE);
5939    }
5940 
5941    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5942       if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
5943          struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5944 
5945          if (shader) {
5946             struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
5947             iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
5948 
5949             pin_scratch_space(ice, batch, shader, stage);
5950          }
5951       }
5952    }
5953 
5954    if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
5955        (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
5956       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5957       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
5958    }
5959 
5960    iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
5961                          IRIS_DOMAIN_VF_READ);
5962 
5963    if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
5964       uint64_t bound = ice->state.bound_vertex_buffers;
5965       while (bound) {
5966          const int i = u_bit_scan64(&bound);
5967          struct pipe_resource *res = genx->vertex_buffers[i].resource;
5968          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5969                             IRIS_DOMAIN_VF_READ);
5970       }
5971    }
5972 }
5973 
5974 static void
5975 iris_restore_compute_saved_bos(struct iris_context *ice,
5976                                struct iris_batch *batch,
5977                                const struct pipe_grid_info *grid)
5978 {
5979    const uint64_t stage_clean = ~ice->state.stage_dirty;
5980 
5981    const int stage = MESA_SHADER_COMPUTE;
5982    struct iris_shader_state *shs = &ice->state.shaders[stage];
5983 
5984    if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
5985       /* Re-pin any buffers referred to by the binding table. */
5986       iris_populate_binding_table(ice, batch, stage, true);
5987    }
5988 
5989    struct pipe_resource *sampler_res = shs->sampler_table.res;
5990    if (sampler_res)
5991       iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
5992                          IRIS_DOMAIN_NONE);
5993 
5994    if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
5995        (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
5996        (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
5997        (stage_clean & IRIS_STAGE_DIRTY_CS)) {
5998       iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
5999                             IRIS_DOMAIN_NONE);
6000    }
6001 
6002    if (stage_clean & IRIS_STAGE_DIRTY_CS) {
6003       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6004 
6005       if (shader) {
6006          struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6007          iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6008 
6009          if (GFX_VERx10 < 125) {
6010             struct iris_bo *curbe_bo =
6011                iris_resource_bo(ice->state.last_res.cs_thread_ids);
6012             iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
6013          }
6014 
6015          pin_scratch_space(ice, batch, shader, stage);
6016       }
6017    }
6018 }
6019 
6020 /**
6021  * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
6022  */
6023 static void
6024 iris_update_binder_address(struct iris_batch *batch,
6025                            struct iris_binder *binder)
6026 {
6027    if (batch->last_binder_address == binder->bo->address)
6028       return;
6029 
6030    struct isl_device *isl_dev = &batch->screen->isl_dev;
6031    uint32_t mocs = isl_mocs(isl_dev, 0, false);
6032 
6033    iris_batch_sync_region_start(batch);
6034 
6035 #if GFX_VER >= 11
6036    /* Use 3DSTATE_BINDING_TABLE_POOL_ALLOC on Icelake and later */
6037 
6038 #if GFX_VERx10 == 120
6039    /* Wa_1607854226:
6040     *
6041     *  Work around the non-pipelined state not applying in MEDIA/GPGPU pipeline
6042     *  mode by putting the pipeline temporarily in 3D mode.
6043     */
6044    if (batch->name == IRIS_BATCH_COMPUTE)
6045       emit_pipeline_select(batch, _3D);
6046 #endif
6047 
6048    iris_emit_pipe_control_flush(batch, "Stall for binder realloc",
6049                                 PIPE_CONTROL_CS_STALL);
6050 
6051    iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
6052       btpa.BindingTablePoolBaseAddress = ro_bo(binder->bo, 0);
6053       btpa.BindingTablePoolBufferSize = binder->size / 4096;
6054 #if GFX_VERx10 < 125
6055       btpa.BindingTablePoolEnable = true;
6056 #endif
6057       btpa.MOCS = mocs;
6058    }
6059 
6060 #if GFX_VERx10 == 120
6061    /* Wa_1607854226:
6062     *
6063     *  Put the pipeline back into compute mode.
6064     */
6065    if (batch->name == IRIS_BATCH_COMPUTE)
6066       emit_pipeline_select(batch, GPGPU);
6067 #endif
6068 #else
6069    /* Use STATE_BASE_ADDRESS on older platforms */
6070    flush_before_state_base_change(batch);
6071 
6072    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
6073       sba.SurfaceStateBaseAddressModifyEnable = true;
6074       sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
6075 
6076       /* The hardware appears to pay attention to the MOCS fields even
6077        * if you don't set the "Address Modify Enable" bit for the base.
6078        */
6079       sba.GeneralStateMOCS            = mocs;
6080       sba.StatelessDataPortAccessMOCS = mocs;
6081       sba.DynamicStateMOCS            = mocs;
6082       sba.IndirectObjectMOCS          = mocs;
6083       sba.InstructionMOCS             = mocs;
6084       sba.SurfaceStateMOCS            = mocs;
6085 #if GFX_VER >= 9
6086       sba.BindlessSurfaceStateMOCS    = mocs;
6087 #endif
6088 #if GFX_VERx10 >= 125
6089       sba.L1CacheControl = L1CC_WB;
6090 #endif
6091    }
6092 #endif
6093 
6094    flush_after_state_base_change(batch);
6095    iris_batch_sync_region_end(batch);
6096 
6097    batch->last_binder_address = binder->bo->address;
6098 }
6099 
6100 static inline void
6101 iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
6102                         bool window_space_position, float *zmin, float *zmax)
6103 {
6104    if (window_space_position) {
6105       *zmin = 0.f;
6106       *zmax = 1.f;
6107       return;
6108    }
6109    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
6110 }
6111 
6112 /* Wa_16018063123 */
6113 static inline void
6114 batch_emit_fast_color_dummy_blit(struct iris_batch *batch)
6115 {
6116 #if GFX_VERx10 >= 125
6117    iris_emit_cmd(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6118       blt.DestinationBaseAddress = batch->screen->workaround_address;
6119       blt.DestinationMOCS = iris_mocs(batch->screen->workaround_address.bo,
6120                                       &batch->screen->isl_dev,
6121                                       ISL_SURF_USAGE_BLITTER_DST_BIT);
6122       blt.DestinationPitch = 63;
6123       blt.DestinationX2 = 1;
6124       blt.DestinationY2 = 4;
6125       blt.DestinationSurfaceWidth = 1;
6126       blt.DestinationSurfaceHeight = 4;
6127       blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6128       blt.DestinationSurfaceQPitch = 4;
6129       blt.DestinationTiling = XY_TILE_LINEAR;
6130    }
6131 #endif
6132 }
6133 
6134 #if GFX_VER >= 12
6135 static void
6136 invalidate_aux_map_state_per_engine(struct iris_batch *batch)
6137 {
6138    uint64_t register_addr = 0;
6139 
6140    switch (batch->name) {
6141    case IRIS_BATCH_RENDER: {
6142       /* HSD 1209978178: docs say that before programming the aux table:
6143        *
6144        *    "Driver must ensure that the engine is IDLE but ensure it doesn't
6145        *    add extra flushes in the case it knows that the engine is already
6146        *    IDLE."
6147        *
6148        * An end of pipe sync is needed here, otherwise we see GPU hangs in
6149        * dEQP-GLES31.functional.copy_image.* tests.
6150        *
6151        * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6152        *
6153        *    "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
6154        *
6155        * Notice we don't set the L3 Fabric Flush here, because we have
6156        * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6157        * documentation says:
6158        *
6159        *    "L3 Fabric Flush will ensure all the pending transactions in the
6160        *     L3 Fabric are flushed to global observation point. HW does
6161        *     implicit L3 Fabric Flush on all stalling flushes (both explicit
6162        *     and implicit) and on PIPECONTROL having Post Sync Operation
6163        *     enabled."
6164        *
6165        * Therefore setting L3 Fabric Flush here would be redundant.
6166        *
6167        * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6168        * RCS engine idle sequence:
6169        *
6170        *    Gfx125+:
6171        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6172        *                      Target Cache Flush + Depth Cache + CCS flush
6173        *
6174        */
6175       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6176                                  PIPE_CONTROL_CS_STALL |
6177                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
6178                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
6179                                  (GFX_VERx10 == 125 ?
6180                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6181 
6182       register_addr = GENX(GFX_CCS_AUX_INV_num);
6183       break;
6184    }
6185    case IRIS_BATCH_COMPUTE: {
6186       /*
6187        * Notice we don't set the L3 Fabric Flush here, because we have
6188        * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6189        * documentation says:
6190        *
6191        *    "L3 Fabric Flush will ensure all the pending transactions in the
6192        *     L3 Fabric are flushed to global observation point. HW does
6193        *     implicit L3 Fabric Flush on all stalling flushes (both explicit
6194        *     and implicit) and on PIPECONTROL having Post Sync Operation
6195        *     enabled."
6196        *
6197        * Therefore setting L3 Fabric Flush here would be redundant.
6198        *
6199        * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6200        * Compute engine idle sequence:
6201        *
6202        *    Gfx125+:
6203        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
6204        */
6205       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6206                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
6207                                  PIPE_CONTROL_CS_STALL |
6208                                  (GFX_VERx10 == 125 ?
6209                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6210 
6211       register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
6212       break;
6213    }
6214    case IRIS_BATCH_BLITTER: {
6215 #if GFX_VERx10 >= 125
6216       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6217       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
6218          batch_emit_fast_color_dummy_blit(batch);
6219 
6220       /*
6221        * Notice we don't set the L3 Fabric Flush here, because we have
6222        * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6223        * documentation says:
6224        *
6225        *    "L3 Fabric Flush will ensure all the pending transactions in the
6226        *     L3 Fabric are flushed to global observation point. HW does
6227        *     implicit L3 Fabric Flush on all stalling flushes (both explicit
6228        *     and implicit) and on PIPECONTROL having Post Sync Operation
6229        *     enabled."
6230        *
6231        * Therefore setting L3 Fabric Flush here would be redundant.
6232        *
6233        * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6234        * Blitter engine idle sequence:
6235        *
6236        *    Gfx125+:
6237        *       MI_FLUSH_DW (dw0;b16 – flush CCS)
6238        */
6239       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
6240          fd.FlushCCS = true;
6241       }
6242       register_addr = GENX(BCS_CCS_AUX_INV_num);
6243 #endif
6244       break;
6245    }
6246    default:
6247       unreachable("Invalid batch for aux map invalidation");
6248       break;
6249    }
6250 
6251    if (register_addr != 0) {
6252       /* If the aux-map state number increased, then we need to rewrite the
6253        * register. Rewriting the register both sets the aux-map
6254        * translation table address and invalidates any previously
6255        * cached translations.
6256        */
6257       iris_load_register_imm32(batch, register_addr, 1);
6258 
6259       /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6260        *
6261        *    "Poll Aux Invalidation bit once the invalidation is set (Register
6262        *     4208 bit 0)"
6263        */
6264       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6265          sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6266          sem.WaitMode = PollingMode;
6267          sem.RegisterPollMode = true;
6268          sem.SemaphoreDataDword = 0x0;
6269          sem.SemaphoreAddress = ro_bo(NULL, register_addr);
6270       }
6271    }
6272 }
6273 
6274 void
6275 genX(invalidate_aux_map_state)(struct iris_batch *batch)
6276 {
6277    struct iris_screen *screen = batch->screen;
6278    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6279    if (!aux_map_ctx)
6280       return;
6281    uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
6282    if (batch->last_aux_map_state != aux_map_state_num) {
6283       invalidate_aux_map_state_per_engine(batch);
6284       batch->last_aux_map_state = aux_map_state_num;
6285    }
6286 }
6287 
6288 static void
6289 init_aux_map_state(struct iris_batch *batch)
6290 {
6291    struct iris_screen *screen = batch->screen;
6292    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6293    if (!aux_map_ctx)
6294       return;
6295 
6296    uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
6297    assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
6298 
6299    uint32_t reg = 0;
6300    switch (batch->name) {
6301    case IRIS_BATCH_COMPUTE:
6302       if (iris_bufmgr_compute_engine_supported(screen->bufmgr)) {
6303          reg = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
6304          break;
6305       }
6306       /* fallthrough */
6307       FALLTHROUGH;
6308    case IRIS_BATCH_RENDER:
6309       reg = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
6310       break;
6311    case IRIS_BATCH_BLITTER:
6312 #if GFX_VERx10 >= 125
6313       reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
6314 #endif
6315       break;
6316    default:
6317       unreachable("Invalid batch for aux map init.");
6318    }
6319 
6320    if (reg)
6321       iris_load_register_imm64(batch, reg, base_addr);
6322 }
6323 #endif
6324 
6325 struct push_bos {
6326    struct {
6327       struct iris_address addr;
6328       uint32_t length;
6329    } buffers[4];
6330    int buffer_count;
6331    uint32_t max_length;
6332 };
6333 
6334 static void
6335 setup_constant_buffers(struct iris_context *ice,
6336                        struct iris_batch *batch,
6337                        int stage,
6338                        struct push_bos *push_bos)
6339 {
6340    struct iris_shader_state *shs = &ice->state.shaders[stage];
6341    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6342 
6343    uint32_t push_range_sum = 0;
6344 
6345    int n = 0;
6346    for (int i = 0; i < 4; i++) {
6347       const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6348 
6349       if (range->length == 0)
6350          continue;
6351 
6352       push_range_sum += range->length;
6353 
6354       if (range->length > push_bos->max_length)
6355          push_bos->max_length = range->length;
6356 
6357       /* Range block is a binding table index, map back to UBO index. */
6358       unsigned block_index = iris_bti_to_group_index(
6359          &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6360       assert(block_index != IRIS_SURFACE_NOT_USED);
6361 
6362       struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6363       struct iris_resource *res = (void *) cbuf->buffer;
6364 
6365       assert(cbuf->buffer_offset % 32 == 0);
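      /* Push ranges are measured in 32-byte (256-bit) registers, which is
       * why range->start is scaled by 32 below and the buffer offset must
       * be 32-byte aligned.
       */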
6366 
6367       if (res)
6368          iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
6369 
6370       push_bos->buffers[n].length = range->length;
6371       push_bos->buffers[n].addr =
6372          res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
6373          : batch->screen->workaround_address;
6374       n++;
6375    }
6376 
6377    /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
6378     *
6379     *    "The sum of all four read length fields must be less than or
6380     *    equal to the size of 64."
6381     */
6382    assert(push_range_sum <= 64);
6383 
6384    push_bos->buffer_count = n;
6385 }
6386 
6387 static void
6388 emit_push_constant_packets(struct iris_context *ice,
6389                            struct iris_batch *batch,
6390                            int stage,
6391                            const struct push_bos *push_bos)
6392 {
6393    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
6394 
6395    iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
6396       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
6397 
6398 #if GFX_VER >= 9
6399       pkt.MOCS = isl_mocs(isl_dev, 0, false);
6400 #endif
6401 
6402       /* The Skylake PRM contains the following restriction:
6403        *
6404        *    "The driver must ensure The following case does not occur
6405        *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
6406        *     buffer 3 read length equal to zero committed followed by a
6407        *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
6408        *     zero committed."
6409        *
6410        * To avoid this, we program the buffers in the highest slots.
6411        * This way, slot 0 is only used if slot 3 is also used.
6412        */
6413       const int n = push_bos->buffer_count;
6414       assert(n <= 4);
6415       const unsigned shift = 4 - n;
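      /* For example, with two ranges (n == 2), shift == 2, so the buffers
       * land in slots 2 and 3 while slots 0 and 1 stay empty.
       */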
6416       for (int i = 0; i < n; i++) {
6417          pkt.ConstantBody.ReadLength[i + shift] =
6418             push_bos->buffers[i].length;
6419          pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
6420       }
6421    }
6422 }
6423 
6424 #if GFX_VER >= 12
6425 static void
6426 emit_null_push_constant_tbimr_workaround(struct iris_batch *batch)
6427 {
6428    struct isl_device *isl_dev = &batch->screen->isl_dev;
6429    /* Pass a single-register push constant payload for the PS
6430     * stage even if empty, since PS invocations with zero push
6431     * constant cycles have been found to cause hangs with TBIMR
6432     * enabled.  See HSDES #22020184996.
6433     *
6434     * XXX - Use workaround infrastructure and final workaround
6435     *       when provided by hardware team.
6436     */
6437    const struct iris_address null_addr = {
6438       .bo = batch->screen->workaround_bo,
6439       .offset = 1024,
6440    };
6441    const uint32_t num_dwords = 2 + 2 * 1;
6442    uint32_t const_all[num_dwords];
6443    uint32_t *dw = &const_all[0];
6444 
6445    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6446       all.DWordLength = num_dwords - 2;
6447       all.MOCS = isl_mocs(isl_dev, 0, false);
6448       all.ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT);
6449       all.PointerBufferMask = 1;
6450    }
6451    dw += 2;
6452 
6453    _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), dw, data) {
6454       data.PointerToConstantBuffer = null_addr;
6455       data.ConstantBufferReadLength = 1;
6456    }
6457 
6458    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6459 }
6460 
6461 static void
6462 emit_push_constant_packet_all(struct iris_context *ice,
6463                               struct iris_batch *batch,
6464                               uint32_t shader_mask,
6465                               const struct push_bos *push_bos)
6466 {
6467    struct isl_device *isl_dev = &batch->screen->isl_dev;
6468 
6469    if (!push_bos) {
6470       if (batch->screen->devinfo->needs_null_push_constant_tbimr_workaround &&
6471           (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
6472          emit_null_push_constant_tbimr_workaround(batch);
6473          shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
6474       }
6475 
6476       if (shader_mask) {
6477          iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
6478             pc.ShaderUpdateEnable = shader_mask;
6479             pc.MOCS = iris_mocs(NULL, isl_dev, 0);
6480          }
6481       }
6482       return;
6483    }
6484 
6485    const uint32_t n = push_bos->buffer_count;
6486    const uint32_t max_pointers = 4;
6487    const uint32_t num_dwords = 2 + 2 * n;
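   /* Two DWords of 3DSTATE_CONSTANT_ALL header plus two DWords of
    * 3DSTATE_CONSTANT_ALL_DATA per buffer.
    */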
6488    uint32_t const_all[2 + 2 * max_pointers];
6489    uint32_t *dw = &const_all[0];
6490 
6491    assert(n <= max_pointers);
6492    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6493       all.DWordLength = num_dwords - 2;
6494       all.MOCS = isl_mocs(isl_dev, 0, false);
6495       all.ShaderUpdateEnable = shader_mask;
6496       all.PointerBufferMask = (1 << n) - 1;
6497    }
6498    dw += 2;
6499 
6500    for (int i = 0; i < n; i++) {
6501       _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
6502                        dw + i * 2, data) {
6503          data.PointerToConstantBuffer = push_bos->buffers[i].addr;
6504          data.ConstantBufferReadLength = push_bos->buffers[i].length;
6505       }
6506    }
6507    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6508 }
6509 #endif
6510 
6511 void
6512 genX(emit_depth_state_workarounds)(struct iris_context *ice,
6513                                    struct iris_batch *batch,
6514                                    const struct isl_surf *surf)
6515 {
6516 #if INTEL_NEEDS_WA_1808121037
6517    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6518                                surf->samples == 1;
6519 
6520    switch (ice->state.genx->depth_reg_mode) {
6521    case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
6522       if (!is_d16_1x_msaa)
6523          return;
6524       break;
6525    case IRIS_DEPTH_REG_MODE_D16_1X_MSAA:
6526       if (is_d16_1x_msaa)
6527          return;
6528       break;
6529    case IRIS_DEPTH_REG_MODE_UNKNOWN:
6530       break;
6531    }
6532 
6533    /* We'll change some CHICKEN registers depending on the depth surface
6534     * format. Do a depth flush and stall so the pipeline is not using these
6535     * settings while we change the registers.
6536     */
6537    iris_emit_end_of_pipe_sync(batch,
6538                               "Workaround: Stop pipeline for Wa_1808121037",
6539                               PIPE_CONTROL_DEPTH_STALL |
6540                               PIPE_CONTROL_DEPTH_CACHE_FLUSH);
6541 
6542    /* Wa_1808121037
6543     *
6544     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6545     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6546     */
6547    iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6548       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6549       reg.HIZPlaneOptimizationdisablebitMask = true;
6550    }
6551 
6552    ice->state.genx->depth_reg_mode =
6553       is_d16_1x_msaa ? IRIS_DEPTH_REG_MODE_D16_1X_MSAA :
6554                        IRIS_DEPTH_REG_MODE_HW_DEFAULT;
6555 #endif
6556 }
6557 
6558 /* Calculate TBIMR tiling parameters adequate for the current pipeline
6559  * setup.  Return true if TBIMR should be enabled.
6560  */
6561 UNUSED static bool
6562 calculate_tile_dimensions(struct iris_context *ice,
6563                           unsigned *tile_width, unsigned *tile_height)
6564 {
6565    struct iris_screen *screen = (void *)ice->ctx.screen;
6566    const struct intel_device_info *devinfo = screen->devinfo;
6567 
6568    assert(GFX_VER == 12);
6569    const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
6570 
6571    /* Perform a rough calculation of the tile cache footprint of the
6572     * pixel pipeline, approximating it as the sum of the amount of
6573     * memory used per pixel by every render target, depth, stencil and
6574     * auxiliary surfaces bound to the pipeline.
6575     */
6576    unsigned pixel_size = 0;
6577 
6578    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
6579 
6580    if (cso->width == 0 || cso->height == 0)
6581       return false;
6582 
6583    for (unsigned i = 0; i < cso->nr_cbufs; i++) {
6584       const struct iris_surface *surf = (void *)cso->cbufs[i];
6585 
6586       if (surf) {
6587          const struct iris_resource *res = (void *)surf->base.texture;
6588 
6589          pixel_size += intel_calculate_surface_pixel_size(&res->surf);
6590 
6591          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6592           *       aux surface traffic.
6593           */
6594          if (ice->state.draw_aux_usage[i]) {
6595             pixel_size += intel_calculate_surface_pixel_size(&res->aux.surf);
6596 
6597             if (isl_aux_usage_has_ccs(res->aux.usage)) {
6598                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6599                                              &res->surf), aux_scale);
6600             }
6601          }
6602       }
6603    }
6604 
6605    if (cso->zsbuf) {
6606       struct iris_resource *zres;
6607       struct iris_resource *sres;
6608       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres, &sres);
6609 
6610       if (zres) {
6611          pixel_size += intel_calculate_surface_pixel_size(&zres->surf);
6612 
6613          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6614           *       aux surface traffic.
6615           */
6616          if (iris_resource_level_has_hiz(devinfo, zres, cso->zsbuf->u.tex.level)) {
6617             pixel_size += intel_calculate_surface_pixel_size(&zres->aux.surf);
6618 
6619             if (isl_aux_usage_has_ccs(zres->aux.usage)) {
6620                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6621                                              &zres->surf), aux_scale);
6622             }
6623          }
6624       }
6625 
6626       if (sres) {
6627          pixel_size += intel_calculate_surface_pixel_size(&sres->surf);
6628       }
6629    }
6630 
6631    /* Compute a tile layout that allows reasonable utilization of the
6632     * tile cache based on the per-pixel cache footprint estimated
6633     * above.
6634     */
6635    intel_calculate_tile_dimensions(devinfo, screen->l3_config_3d,
6636                                    32, 32, cso->width, cso->height, pixel_size,
6637                                    tile_width, tile_height);
6638 
6639    /* Perform TBIMR tile passes only if the framebuffer covers more
6640     * than a single tile.
6641     */
6642    return *tile_width < cso->width || *tile_height < cso->height;
6643 }
6644 
6645 static void
6646 iris_preemption_streamout_wa(struct iris_context *ice,
6647                              struct iris_batch *batch,
6648                              bool enable)
6649 {
6650 #if GFX_VERx10 >= 120
6651    if (!intel_needs_workaround(batch->screen->devinfo, 16013994831))
6652       return;
6653 
6654    iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
6655       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !enable;
6656       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
6657    }
6658 
6659    /* Emit CS_STALL and 250 noops. */
6660    iris_emit_pipe_control_flush(batch, "workaround: Wa_16013994831",
6661                                 PIPE_CONTROL_CS_STALL);
6662    for (unsigned i = 0; i < 250; i++)
6663       iris_emit_cmd(batch, GENX(MI_NOOP), noop);
6664 
6665    ice->state.genx->object_preemption = enable;
6666 #endif
6667 }
6668 
6669 static void
6670 shader_program_uses_primitive_id(struct iris_context *ice,
6671                                  struct iris_batch *batch,
6672                                  struct iris_compiled_shader *shader,
6673                                  gl_shader_stage stage,
6674                                  bool *uses_primitive_id)
6675 {
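        /* Accumulate whether the stage being emitted (TCS/TES) or the
         * currently bound GS reads the primitive ID.  The result feeds the
         * Gfx12.5+ dynamic 3DSTATE_TE/3DSTATE_VFG workarounds.
         */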
6676    switch (stage) {
6677    case MESA_SHADER_TESS_CTRL: {
6678       struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
6679       *uses_primitive_id |= tcs_data->include_primitive_id;
6680       break;
6681    }
6682    case MESA_SHADER_TESS_EVAL: {
6683       struct iris_tes_data *tes_data = iris_tes_data(shader);
6684       *uses_primitive_id |= tes_data->include_primitive_id;
6685       break;
6686    }
6687    default:
6688       break;
6689    }
6690 
6691    struct iris_compiled_shader *gs_shader =
6692       ice->shaders.prog[MESA_SHADER_GEOMETRY];
6693    const struct iris_gs_data *gs_data =
6694       gs_shader ? iris_gs_data(gs_shader) : NULL;
6695 
6696    *uses_primitive_id |= gs_data && gs_data->include_primitive_id;
6697 }
6698 
6699 static void
6700 emit_wa_18020335297_dummy_draw(struct iris_batch *batch)
6701 {
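        /* Program a minimal pipeline: every shader stage disabled and the
         * clipper set to reject everything, then issue one small dummy draw
         * per slice.
         */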
6702 #if GFX_VERx10 >= 125
6703    iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
6704       vfg.DistributionMode = RR_STRICT;
6705    }
6706    iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6707       vf.GeometryDistributionEnable = true;
6708    }
6709 #endif
6710 
6711 #if GFX_VER >= 12
6712    iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
6713       pr.ReplicaMask = 1;
6714    }
6715 #endif
6716 
6717    iris_emit_cmd(batch, GENX(3DSTATE_RASTER), rr) {
6718       rr.CullMode = CULLMODE_NONE;
6719       rr.FrontFaceFillMode = FILL_MODE_SOLID;
6720       rr.BackFaceFillMode = FILL_MODE_SOLID;
6721    }
6722 
6723    iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { }
6724    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { }
6725 
6726 #if GFX_VER >= 11
6727    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs2) { }
6728 #endif
6729 
6730    iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
6731       clip.ClipEnable = true;
6732       clip.ClipMode = CLIPMODE_REJECT_ALL;
6733    }
6734 
6735    iris_emit_cmd(batch, GENX(3DSTATE_VS), vs) { }
6736    iris_emit_cmd(batch, GENX(3DSTATE_GS), gs) { }
6737    iris_emit_cmd(batch, GENX(3DSTATE_HS), hs) { }
6738    iris_emit_cmd(batch, GENX(3DSTATE_TE), te) { }
6739    iris_emit_cmd(batch, GENX(3DSTATE_DS), ds) { }
6740    iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so) { }
6741 
6742    uint32_t vertex_elements[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
6743    uint32_t *ve_pack_dest = &vertex_elements[1];
6744 
6745    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), vertex_elements, ve) {
6746       ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
6747                        GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
6748    }
6749 
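        /* Two dummy vertex elements: element 0 stores all zeros and
         * element 1 stores (0, 0, 1, 1).
         */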
6750    for (int i = 0; i < 2; i++) {
6751       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6752          ve.Valid = true;
6753          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
6754          ve.Component0Control = VFCOMP_STORE_0;
6755          ve.Component1Control = VFCOMP_STORE_0;
6756          ve.Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6757          ve.Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6758       }
6759       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6760    }
6761 
6762    iris_batch_emit(batch, vertex_elements, sizeof(uint32_t) *
6763                    (1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)));
6764 
6765    iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6766       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
6767    }
6768 
6769    /* Emit dummy draw per slice. */
6770    for (unsigned i = 0; i < batch->screen->devinfo->num_slices; i++) {
6771       iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6772          prim.VertexCountPerInstance = 3;
6773          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
6774          prim.InstanceCount = 1;
6775          prim.VertexAccessType = SEQUENTIAL;
6776       }
6777    }
6778 }
6779 
6780 static void
6781 iris_upload_dirty_render_state(struct iris_context *ice,
6782                                struct iris_batch *batch,
6783                                const struct pipe_draw_info *draw,
6784                                bool skip_vb_params)
6785 {
6786    struct iris_screen *screen = batch->screen;
6787    struct iris_border_color_pool *border_color_pool =
6788       iris_bufmgr_get_border_color_pool(screen->bufmgr);
6789 
6790    /* Re-emit 3DSTATE_DS before any 3DPRIMITIVE when tessellation is on */
6791    if (intel_needs_workaround(batch->screen->devinfo, 22018402687) &&
6792        ice->shaders.prog[MESA_SHADER_TESS_EVAL])
6793       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TES;
6794 
6795    uint64_t dirty = ice->state.dirty;
6796    uint64_t stage_dirty = ice->state.stage_dirty;
6797 
6798    if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
6799        !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
6800       return;
6801 
6802    struct iris_genx_state *genx = ice->state.genx;
6803    struct iris_binder *binder = &ice->state.binder;
6804    struct iris_fs_data *fs_data =
6805       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
6806 
6807    /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
6808     * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
6809     */
6810    bool needs_wa_14018912822 =
6811       screen->driconf.intel_enable_wa_14018912822 &&
6812       intel_needs_workaround(batch->screen->devinfo, 14018912822) &&
6813       util_framebuffer_get_num_samples(&ice->state.framebuffer) > 1;
6814 
6815    if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
6816       const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6817       uint32_t cc_vp_address;
6818       bool wa_18020335297_applied = false;
6819 
6820       /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
6821       if (intel_needs_workaround(screen->devinfo, 18020335297) &&
6822           batch->name == IRIS_BATCH_RENDER &&
6823           ice->state.viewport_ptr_set) {
6824          emit_wa_18020335297_dummy_draw(batch);
6825          wa_18020335297_applied = true;
6826       }
6827 
6828       /* XXX: could avoid streaming for depth_clip [0,1] case. */
6829       uint32_t *cc_vp_map =
6830          stream_state(batch, ice->state.dynamic_uploader,
6831                       &ice->state.last_res.cc_vp,
6832                       4 * ice->state.num_viewports *
6833                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
6834       for (int i = 0; i < ice->state.num_viewports; i++) {
6835          float zmin, zmax;
6836          iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
6837                                  ice->state.window_space_position,
6838                                  &zmin, &zmax);
6839          if (cso_rast->depth_clip_near)
6840             zmin = 0.0;
6841          if (cso_rast->depth_clip_far)
6842             zmax = 1.0;
6843 
6844          iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
6845             ccv.MinimumDepth = zmin;
6846             ccv.MaximumDepth = zmax;
6847          }
6848 
6849          cc_vp_map += GENX(CC_VIEWPORT_length);
6850       }
6851 
6852       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
6853          ptr.CCViewportPointer = cc_vp_address;
6854       }
6855 
6856       if (wa_18020335297_applied) {
6857 #if GFX_VER >= 12
6858          iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { }
6859 #endif
6860          /* Dirty all emitted WA state to make sure that current real
6861           * state is restored.
6862           */
6863          dirty |= IRIS_DIRTY_VFG |
6864                   IRIS_DIRTY_VF |
6865                   IRIS_DIRTY_RASTER |
6866                   IRIS_DIRTY_VF_STATISTICS |
6867                   IRIS_DIRTY_VF_SGVS |
6868                   IRIS_DIRTY_CLIP |
6869                   IRIS_DIRTY_STREAMOUT |
6870                   IRIS_DIRTY_VERTEX_ELEMENTS |
6871                   IRIS_DIRTY_VF_TOPOLOGY;
6872 
6873          for (int stage = 0; stage < MESA_SHADER_FRAGMENT; stage++) {
6874             if (ice->shaders.prog[stage])
6875                stage_dirty |= (IRIS_STAGE_DIRTY_VS << stage);
6876          }
6877       }
6878       ice->state.viewport_ptr_set = true;
6879    }
6880 
6881    if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
6882       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6883       uint32_t sf_cl_vp_address;
6884       uint32_t *vp_map =
6885          stream_state(batch, ice->state.dynamic_uploader,
6886                       &ice->state.last_res.sf_cl_vp,
6887                       4 * ice->state.num_viewports *
6888                       GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
6889 
6890       for (unsigned i = 0; i < ice->state.num_viewports; i++) {
6891          const struct pipe_viewport_state *state = &ice->state.viewports[i];
6892          float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
6893 
6894          float vp_xmin = viewport_extent(state, 0, -1.0f);
6895          float vp_xmax = viewport_extent(state, 0,  1.0f);
6896          float vp_ymin = viewport_extent(state, 1, -1.0f);
6897          float vp_ymax = viewport_extent(state, 1,  1.0f);
6898 
6899          intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
6900                                         state->scale[0], state->scale[1],
6901                                         state->translate[0], state->translate[1],
6902                                         &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
6903 
6904          iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
6905             vp.ViewportMatrixElementm00 = state->scale[0];
6906             vp.ViewportMatrixElementm11 = state->scale[1];
6907             vp.ViewportMatrixElementm22 = state->scale[2];
6908             vp.ViewportMatrixElementm30 = state->translate[0];
6909             vp.ViewportMatrixElementm31 = state->translate[1];
6910             vp.ViewportMatrixElementm32 = state->translate[2];
6911             vp.XMinClipGuardband = gb_xmin;
6912             vp.XMaxClipGuardband = gb_xmax;
6913             vp.YMinClipGuardband = gb_ymin;
6914             vp.YMaxClipGuardband = gb_ymax;
6915             vp.XMinViewPort = MAX2(vp_xmin, 0);
6916             vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6917             vp.YMinViewPort = MAX2(vp_ymin, 0);
6918             vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6919          }
6920 
6921          vp_map += GENX(SF_CLIP_VIEWPORT_length);
6922       }
6923 
6924       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6925          ptr.SFClipViewportPointer = sf_cl_vp_address;
6926       }
6927    }
6928 
6929    if (dirty & IRIS_DIRTY_URB) {
6930       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6931          if (!ice->shaders.prog[i]) {
6932             ice->shaders.urb.cfg.size[i] = 1;
6933          } else {
6934             struct iris_vue_data *vue_data =
6935                iris_vue_data(ice->shaders.prog[i]);
6936             ice->shaders.urb.cfg.size[i] = vue_data->urb_entry_size;
6937          }
6938          assert(ice->shaders.urb.cfg.size[i] != 0);
6939       }
6940 
6941       genX(emit_urb_config)(batch,
6942                             ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
6943                             ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL);
6944    }
6945 
6946    if (dirty & IRIS_DIRTY_BLEND_STATE) {
6947       struct iris_blend_state *cso_blend = ice->state.cso_blend;
6948       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6949       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6950 
6951       bool color_blend_zero = false;
6952       bool alpha_blend_zero = false;
6953 
6954       /* Always write at least one BLEND_STATE - the final RT message will
6955        * reference BLEND_STATE[0] even if there aren't color writes.  There
6956        * may still be alpha testing, computed depth, and so on.
6957        */
6958       const int rt_dwords =
6959          MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6960 
6961       uint32_t blend_offset;
6962       uint32_t *blend_map =
6963          stream_state(batch, ice->state.dynamic_uploader,
6964                       &ice->state.last_res.blend,
6965                       96, 64, &blend_offset);
6966 
6967       /* Copy of blend entries for merging dynamic changes. */
6968       uint32_t blend_entries[4 * rt_dwords];
6969       memcpy(blend_entries, &cso_blend->blend_state[1], sizeof(blend_entries));
6970 
6971       unsigned cbufs = MAX2(cso_fb->nr_cbufs, 1);
6972 
6973       uint32_t *blend_entry = blend_entries;
6974       for (unsigned i = 0; i < cbufs; i++) {
6975          int dst_blend_factor = cso_blend->ps_dst_blend_factor[i];
6976          int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[i];
6977          uint32_t entry[GENX(BLEND_STATE_ENTRY_length)];
6978          iris_pack_state(GENX(BLEND_STATE_ENTRY), entry, be) {
6979             if (needs_wa_14018912822) {
6980                if (dst_blend_factor == BLENDFACTOR_ZERO) {
6981                   dst_blend_factor = BLENDFACTOR_CONST_COLOR;
6982                   color_blend_zero = true;
6983                }
6984                if (dst_alpha_blend_factor == BLENDFACTOR_ZERO) {
6985                   dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
6986                   alpha_blend_zero = true;
6987                }
6988             }
6989             be.DestinationBlendFactor = dst_blend_factor;
6990             be.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
6991          }
6992 
6993          /* Merge entry. */
6994          uint32_t *dst = blend_entry;
6995          uint32_t *src = entry;
6996          for (unsigned j = 0; j < GENX(BLEND_STATE_ENTRY_length); j++)
6997             *dst++ |= *src++;
6998 
6999          blend_entry += GENX(BLEND_STATE_ENTRY_length);
7000       }
7001 
7002       /* Blend constants modified for Wa_14018912822. */
7003       if (ice->state.color_blend_zero != color_blend_zero) {
7004          ice->state.color_blend_zero = color_blend_zero;
7005          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7006       }
7007       if (ice->state.alpha_blend_zero != alpha_blend_zero) {
7008          ice->state.alpha_blend_zero = alpha_blend_zero;
7009          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7010       }
7011 
7012       uint32_t blend_state_header;
7013       iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
7014          bs.AlphaTestEnable = cso_zsa->alpha_enabled;
7015          bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
7016       }
7017 
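           /* Merge the dynamic alpha test fields into the first BLEND_STATE
            * DWord from the CSO, then append the per-RT entries.
            */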
7018       blend_map[0] = blend_state_header | cso_blend->blend_state[0];
7019       memcpy(&blend_map[1], blend_entries, 4 * rt_dwords);
7020 
7021       iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
7022          ptr.BlendStatePointer = blend_offset;
7023          ptr.BlendStatePointerValid = true;
7024       }
7025    }
7026 
7027    if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
7028       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7029 #if GFX_VER == 8
7030       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7031 #endif
7032       uint32_t cc_offset;
7033       void *cc_map =
7034          stream_state(batch, ice->state.dynamic_uploader,
7035                       &ice->state.last_res.color_calc,
7036                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
7037                       64, &cc_offset);
7038       iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
7039          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
7040          cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
7041          cc.BlendConstantColorRed   = ice->state.color_blend_zero ?
7042             0.0 : ice->state.blend_color.color[0];
7043          cc.BlendConstantColorGreen = ice->state.color_blend_zero ?
7044             0.0 : ice->state.blend_color.color[1];
7045          cc.BlendConstantColorBlue  = ice->state.color_blend_zero ?
7046             0.0 : ice->state.blend_color.color[2];
7047          cc.BlendConstantColorAlpha = ice->state.alpha_blend_zero ?
7048             0.0 : ice->state.blend_color.color[3];
7049 #if GFX_VER == 8
7050          cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
7051          cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7052 #endif
7053       }
7054       iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7055          ptr.ColorCalcStatePointer = cc_offset;
7056          ptr.ColorCalcStatePointerValid = true;
7057       }
7058    }
7059 
7060 #if GFX_VERx10 == 125
7061    if (dirty & (IRIS_DIRTY_RENDER_BUFFER | IRIS_DIRTY_DEPTH_BUFFER)) {
7062       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7063       unsigned tile_width, tile_height;
7064 
7065       ice->state.use_tbimr = batch->screen->driconf.enable_tbimr &&
7066          calculate_tile_dimensions(ice, &tile_width, &tile_height);
7067 
7068       if (ice->state.use_tbimr) {
7069          /* Use a batch size of 128 polygons per slice as recommended
7070           * by BSpec 68436 "TBIMR Programming".
7071           */
7072          const unsigned num_slices = screen->devinfo->num_slices;
7073          const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
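              /* DIV_ROUND_UP(num_slices, 2) * 256 is num_slices * 128
               * rounded up to the next multiple of 256.
               */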
7074 
7075          iris_emit_cmd(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
7076             tbimr.TileRectangleHeight = tile_height;
7077             tbimr.TileRectangleWidth = tile_width;
7078             tbimr.VerticalTileCount = DIV_ROUND_UP(cso_fb->height, tile_height);
7079             tbimr.HorizontalTileCount = DIV_ROUND_UP(cso_fb->width, tile_width);
7080             tbimr.TBIMRBatchSize = util_logbase2(batch_size) - 5;
7081             tbimr.TileBoxCheck = true;
7082          }
7083       }
7084    }
7085 #endif
7086 
7087    /* Wa_1604061319
7088     *
7089     *    3DSTATE_CONSTANT_* needs to be programmed before BTP_*
7090     *
7091     * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
7092     * any stage has a dirty binding table.
7093     */
7094    const bool emit_const_wa = GFX_VER >= 11 &&
7095       ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
7096        (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
7097 
7098 #if GFX_VER >= 12
7099    uint32_t nobuffer_stages = 0;
7100 #endif
7101 
7102    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7103       if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
7104           !emit_const_wa)
7105          continue;
7106 
7107       struct iris_shader_state *shs = &ice->state.shaders[stage];
7108       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7109 
7110       if (!shader)
7111          continue;
7112 
7113       if (shs->sysvals_need_upload)
7114          upload_sysvals(ice, stage, NULL);
7115 
7116       struct push_bos push_bos = {};
7117       setup_constant_buffers(ice, batch, stage, &push_bos);
7118 
7119 #if GFX_VER >= 12
7120       /* If this stage doesn't have any push constants, emit it later in a
7121        * single CONSTANT_ALL packet with all the other stages.
7122        */
7123       if (push_bos.buffer_count == 0) {
7124          nobuffer_stages |= 1 << stage;
7125          continue;
7126       }
7127 
7128       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
7129        * contains only 5 bits, so we can only use it for buffers smaller than
7130        * 32.
7131        *
7132        * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
7133        * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
7134        * for disabling stages, where all address bits are zero.  However, we
7135        * can't safely use it for general buffers with arbitrary addresses.
7136        * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
7137        * case.
7138        */
7139       if (push_bos.max_length < 32 && GFX_VERx10 > 120) {
7140          emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
7141          continue;
7142       }
7143 #endif
7144       emit_push_constant_packets(ice, batch, stage, &push_bos);
7145    }
7146 
7147 #if GFX_VER >= 12
7148    if (nobuffer_stages)
7149       /* Wa_16011448509: all address bits are zero */
7150       emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
7151 #endif
7152 
7153    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7154       /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
7155        * in order to commit constants.  TODO: Investigate "Disable Gather
7156        * at Set Shader" to go back to legacy mode...
7157        */
7158       if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
7159                           (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
7160                             << stage)) {
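              /* The BINDING_TABLE_POINTERS_{VS,HS,DS,GS,PS} sub-opcodes are
               * consecutive starting at 38, so patch the VS packet's
               * sub-opcode to address the right stage.
               */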
7161          iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
7162             ptr._3DCommandSubOpcode = 38 + stage;
7163             ptr.PointertoVSBindingTable =
7164                binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
7165          }
7166       }
7167    }
7168 
7169    if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
7170       // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
7171       // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
7172 
7173       /* The PIPE_CONTROL command description says:
7174        *
7175        *   "Whenever a Binding Table Index (BTI) used by a Render Target
7176        *    Message points to a different RENDER_SURFACE_STATE, SW must issue a
7177        *    Render Target Cache Flush by enabling this bit. When render target
7178        *    flush is set due to new association of BTI, PS Scoreboard Stall bit
7179        *    must be set in this packet."
7180        */
7181       // XXX: does this need to happen at 3DSTATE_BTP_PS time?
7182       iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
7183                                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
7184                                    PIPE_CONTROL_STALL_AT_SCOREBOARD);
7185    }
7186 
7187    if (dirty & IRIS_DIRTY_RENDER_BUFFER)
7188       trace_framebuffer_state(&batch->trace, NULL, &ice->state.framebuffer);
7189 
7190    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7191       if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
7192          iris_populate_binding_table(ice, batch, stage, false);
7193       }
7194    }
7195 
7196    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7197       if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
7198           !ice->shaders.prog[stage])
7199          continue;
7200 
7201       iris_upload_sampler_states(ice, stage);
7202 
7203       struct iris_shader_state *shs = &ice->state.shaders[stage];
7204       struct pipe_resource *res = shs->sampler_table.res;
7205       if (res)
7206          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
7207                             IRIS_DOMAIN_NONE);
7208 
7209       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
7210          ptr._3DCommandSubOpcode = 43 + stage;
7211          ptr.PointertoVSSamplerState = shs->sampler_table.offset;
7212       }
7213    }
7214 
7215    if (ice->state.need_border_colors)
7216       iris_use_pinned_bo(batch, border_color_pool->bo, false, IRIS_DOMAIN_NONE);
7217 
7218    if (dirty & IRIS_DIRTY_MULTISAMPLE) {
7219       iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
7220          ms.PixelLocation =
7221             ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
7222          if (ice->state.framebuffer.samples > 0)
7223             ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
7224       }
7225    }
7226 
7227    if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
7228       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
7229          ms.SampleMask = ice->state.sample_mask;
7230       }
7231    }
7232 
7233 #if GFX_VERx10 >= 125
7234    /* This is only used on >= gfx125 for dynamic 3DSTATE_TE and
7235     * 3DSTATE_VFG emission related workarounds.
7236     */
7237    bool program_uses_primitive_id = false;
7238 
7239    /* Check if FS stage will use primitive ID overrides. */
7240    const struct intel_vue_map *last_vue_map =
7241       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7242    if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
7243        last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
7244       program_uses_primitive_id = true;
7245    }
7246 #endif
7247 
7248    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7249       if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
7250          continue;
7251 
7252       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7253 
7254       if (shader) {
7255          struct iris_resource *cache = (void *) shader->assembly.res;
7256          iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
7257 
7258          uint32_t scratch_addr =
7259             pin_scratch_space(ice, batch, shader, stage);
7260 
7261 #if GFX_VERx10 >= 125
7262          shader_program_uses_primitive_id(ice, batch, shader, stage,
7263                                           &program_uses_primitive_id);
7264 #endif
7265 
7266          if (stage == MESA_SHADER_FRAGMENT) {
7267             UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
7268             struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7269 
7270             uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
7271             _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
7272 #if GFX_VER >= 9
7273                struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(shader->brw_prog_data);
7274 #else
7275                struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(shader->elk_prog_data);
7276 #endif
7277                intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
7278                                            wm_prog_data, util_framebuffer_get_num_samples(cso_fb),
7279                                            0 /* msaa_flags */);
7280 
7281 #if GFX_VER == 12
7282                assert(fs_data->dispatch_multi == 0 ||
7283                       (fs_data->dispatch_multi == 16 && fs_data->max_polygons == 2));
7284                ps.DualSIMD8DispatchEnable = fs_data->dispatch_multi;
7285                /* XXX - No major improvement observed from enabling
7286                 *       overlapping subspans, but it could be helpful
7287                 *       in theory when the requirements listed on the
7288                 *       BSpec page for 3DSTATE_PS_BODY are met.
7289                 */
7290                ps.OverlappingSubspansEnable = false;
7291 #endif
7292 
7293 #if GFX_VER >= 9
7294                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7295                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7296                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7297                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7298 #if GFX_VER < 20
7299                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7300                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7301 #endif
7302 
7303                ps.KernelStartPointer0 = KSP(shader) +
7304                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7305                ps.KernelStartPointer1 = KSP(shader) +
7306                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7307 #if GFX_VER < 20
7308                ps.KernelStartPointer2 = KSP(shader) +
7309                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7310 #endif
7311 #else
7312                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7313                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7314                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7315                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7316                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7317                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7318 
7319                ps.KernelStartPointer0 = KSP(shader) +
7320                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7321                ps.KernelStartPointer1 = KSP(shader) +
7322                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7323                ps.KernelStartPointer2 = KSP(shader) +
7324                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7325 #endif
7326 
7327 #if GFX_VERx10 >= 125
7328                ps.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7329 #else
7330                ps.ScratchSpaceBasePointer =
7331                   rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7332 #endif
7333             }
7334 
7335             uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
7336             iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
7337 #if GFX_VER >= 9
7338                if (!fs_data->uses_sample_mask)
7339                   psx.InputCoverageMaskState  = ICMS_NONE;
7340                else if (fs_data->post_depth_coverage)
7341                   psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
7342                else if (fs_data->inner_coverage &&
7343                         cso->conservative_rasterization)
7344                   psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
7345                else
7346                   psx.InputCoverageMaskState = ICMS_NORMAL;
7347 #else
7348                psx.PixelShaderUsesInputCoverageMask =
7349                   fs_data->uses_sample_mask;
7350 #endif
7351             }
7352 
7353             uint32_t *shader_ps = (uint32_t *) shader->derived_data;
7354             uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
7355             iris_emit_merge(batch, shader_ps, ps_state,
7356                             GENX(3DSTATE_PS_length));
7357             iris_emit_merge(batch, shader_psx, psx_state,
7358                             GENX(3DSTATE_PS_EXTRA_length));
7359 #if GFX_VERx10 >= 125
7360          } else if (stage == MESA_SHADER_TESS_EVAL) {
7361             uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
7362             iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
7363                if (intel_needs_workaround(screen->devinfo, 14015055625) &&
7364                    program_uses_primitive_id)
7365                   te.TessellationDistributionMode = TEDMODE_OFF;
7366                else if (intel_needs_workaround(screen->devinfo, 22012699309))
7367                   te.TessellationDistributionMode = TEDMODE_RR_STRICT;
7368                else
7369                   te.TessellationDistributionMode = TEDMODE_RR_FREE;
7370             }
7371 
7372             uint32_t ds_state[GENX(3DSTATE_DS_length)] = { 0 };
7373             iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
7374                if (scratch_addr)
7375                   ds.ScratchSpaceBuffer =
7376                      scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7377             }
7378 
7379             uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7380             uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7381 
7382             iris_emit_merge(batch, shader_ds, ds_state,
7383                             GENX(3DSTATE_DS_length));
7384             iris_emit_merge(batch, shader_te, te_state,
7385                             GENX(3DSTATE_TE_length));
7386 #endif
7387          } else if (scratch_addr) {
7388             uint32_t *pkt = (uint32_t *) shader->derived_data;
7389             switch (stage) {
7390             case MESA_SHADER_VERTEX:    MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
7391             case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
7392             case MESA_SHADER_TESS_EVAL: {
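                    /* The TES derived data holds 3DSTATE_DS followed by
                     * 3DSTATE_TE; emit TE unchanged and merge the scratch
                     * address into DS.
                     */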
7393                uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7394                uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7395                iris_batch_emit(batch, shader_te, 4 * GENX(3DSTATE_TE_length));
7396                MERGE_SCRATCH_ADDR(3DSTATE_DS);
7397                break;
7398             }
7399             case MESA_SHADER_GEOMETRY:  MERGE_SCRATCH_ADDR(3DSTATE_GS); break;
7400             }
7401          } else {
7402             iris_batch_emit(batch, shader->derived_data,
7403                             iris_derived_program_state_size(stage));
7404          }
7405       } else {
7406          if (stage == MESA_SHADER_TESS_EVAL) {
7407             iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7408             iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
7409             iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7410          } else if (stage == MESA_SHADER_GEOMETRY) {
7411             iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
7412          }
7413       }
7414    }
7415 
7416 #if GFX_VERx10 >= 125
7417    /* Inspect program_uses_primitive_id state and dirty VFG if required. */
7418    if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
7419        program_uses_primitive_id != ice->state.uses_primitive_id) {
7420       dirty |= IRIS_DIRTY_VFG;
7421       ice->state.uses_primitive_id = program_uses_primitive_id;
7422    }
7423 #endif
7424 
7425    if (ice->state.streamout_active) {
7426       if (dirty & IRIS_DIRTY_SO_BUFFERS) {
7427          /* Wa_16011411144
7428           * SW must insert a PIPE_CONTROL cmd before and after the
7429           * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* state is
7430           * not combined with other state changes.
7431           */
7432          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7433             iris_emit_pipe_control_flush(batch,
7434                                          "SO pre change stall WA",
7435                                          PIPE_CONTROL_CS_STALL);
7436          }
7437 
7438          for (int i = 0; i < 4; i++) {
7439             struct iris_stream_output_target *tgt =
7440                (void *) ice->state.so_target[i];
7441             enum { dwords = GENX(3DSTATE_SO_BUFFER_length) };
7442             uint32_t *so_buffers = genx->so_buffers + i * dwords;
7443             bool zero_offset = false;
7444 
7445             if (tgt) {
7446                zero_offset = tgt->zero_offset;
7447                iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
7448                                   true, IRIS_DOMAIN_OTHER_WRITE);
7449                iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
7450                                   true, IRIS_DOMAIN_OTHER_WRITE);
7451             }
7452 
7453             if (zero_offset) {
7454                /* Skip the last DWord which contains "Stream Offset" of
7455                 * 0xFFFFFFFF and instead emit a dword of zero directly.
7456                 */
7457                STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
7458                              32 * (dwords - 1));
7459                const uint32_t zero = 0;
7460                iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
7461                iris_batch_emit(batch, &zero, sizeof(zero));
7462                tgt->zero_offset = false;
7463             } else {
7464                iris_batch_emit(batch, so_buffers, 4 * dwords);
7465             }
7466          }
7467 
7468          /* Wa_16011411144 */
7469          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7470             iris_emit_pipe_control_flush(batch,
7471                                          "SO post change stall WA",
7472                                          PIPE_CONTROL_CS_STALL);
7473          }
7474       }
7475 
7476       if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
7477          /* Wa_16011773973:
7478           * If SOL is enabled and SO_DECL state has to be programmed,
7479           *    1. Send 3D State SOL state with SOL disabled
7480           *    2. Send SO_DECL NP state
7481           *    3. Send 3D State SOL with SOL Enabled
7482           */
7483          if (intel_device_info_is_dg2(batch->screen->devinfo))
7484             iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7485 
7486          uint32_t *decl_list =
7487             ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
7488          iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
7489 
7490 #if GFX_VER >= 11 && GFX_VER < 20
7491          /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7492           * 3DSTATE_SO_DECL_LIST:
7493           *
7494           *    "Workaround: This command must be followed by a PIPE_CONTROL
7495           *     with CS Stall bit set."
7496           *
7497           * On DG2+ also known as Wa_1509820217.
7498           */
7499          iris_emit_pipe_control_flush(batch,
7500                                       "workaround: cs stall after so_decl",
7501                                       PIPE_CONTROL_CS_STALL);
7502 #endif
7503       }
7504 
7505       if (dirty & IRIS_DIRTY_STREAMOUT) {
7506          const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7507 
7508 #if GFX_VERx10 >= 120
7509          /* Wa_16013994831 - Disable preemption. */
7510          if (intel_needs_workaround(batch->screen->devinfo, 16013994831))
7511             iris_preemption_streamout_wa(ice, batch, false);
7512 #endif
7513 
7514          uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
7515          iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
7516             sol.SOFunctionEnable = true;
7517             sol.SOStatisticsEnable = true;
7518 
7519             sol.RenderingDisable = cso_rast->rasterizer_discard &&
7520                                    !ice->state.prims_generated_query_active;
7521             sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7522 
7523 
7524 #if INTEL_NEEDS_WA_18022508906
7525             /* Wa_14017076903 :
7526              *
7527              * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
7528              *
7529              * SOL_INT::Render_Enable =
7530              *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
7531              *   (
7532              *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
7533              *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
7534              *     !3DSTATE_STREAMOUT::API_Render_Disable &&
7535              *     (
7536              *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
7537              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
7538              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
7539              *       3DSTATE_PS_EXTRA::PS_Valid ||
7540              *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
7541              *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
7542              *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
7543              *     )
7544              *   )
7545              *
7546              * If SOL_INT::Render_Enable is false, the SO stage will not forward any
7547              * topologies down the pipeline. Which is not what we want for occlusion
7548              * queries.
7549              *
7550              * Here we force rendering to get SOL_INT::Render_Enable when occlusion
7551              * queries are active.
7552              */
7553             const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7554             if (!cso_rast->rasterizer_discard && ice->state.occlusion_query_active)
7555                sol.ForceRendering = Force_on;
7556 #endif
7557          }
7558 
7559          assert(ice->state.streamout);
7560 
7561          iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
7562                          GENX(3DSTATE_STREAMOUT_length));
7563       }
7564    } else {
7565       if (dirty & IRIS_DIRTY_STREAMOUT) {
7566 
7567 #if GFX_VERx10 >= 120
7568          /* Wa_16013994831 - Enable preemption. */
7569          if (!ice->state.genx->object_preemption)
7570             iris_preemption_streamout_wa(ice, batch, true);
7571 #endif
7572 
7573          iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7574       }
7575    }
7576 
7577    if (dirty & IRIS_DIRTY_CLIP) {
7578       struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7579       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7580 
7581       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
7582                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7583       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
7584          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
7585                     : ice->state.prim_is_points_or_lines);
7586       const struct intel_vue_map *last =
7587          &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7588 
7589       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
7590       iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
7591          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
7592          if (cso_rast->rasterizer_discard)
7593             cl.ClipMode = CLIPMODE_REJECT_ALL;
7594          else if (ice->state.window_space_position)
7595             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
7596          else
7597             cl.ClipMode = CLIPMODE_NORMAL;
7598 
7599          cl.PerspectiveDivideDisable = ice->state.window_space_position;
7600          cl.ViewportXYClipTestEnable = !points_or_lines;
7601 
7602          cl.NonPerspectiveBarycentricEnable = fs_data->uses_nonperspective_interp_modes;
7603 
7604          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1 ||
7605                                       !(last->slots_valid & VARYING_BIT_LAYER);
7606          cl.MaximumVPIndex = ice->state.num_viewports - 1;
7607       }
7608       iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
7609                       ARRAY_SIZE(cso_rast->clip));
7610    }
7611 
7612    if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
7613       /* From the Broadwell PRM, Volume 2, documentation for
7614        * 3DSTATE_RASTER, "Antialiasing Enable":
7615        *
7616        * "This field must be disabled if any of the render targets
7617        * have integer (UINT or SINT) surface format."
7618        *
7619        * Additionally internal documentation for Gfx12+ states:
7620        *
7621        * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
7622        *  FORCED_SAMPLE_COUNT > 1."
7623        */
7624       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7625       unsigned samples = util_framebuffer_get_num_samples(cso_fb);
7626       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7627 
7628       bool aa_enable = cso->line_smooth &&
7629                        !ice->state.has_integer_rt &&
7630                        !(batch->screen->devinfo->ver >= 12 && samples > 1);
7631 
7632       uint32_t dynamic_raster[GENX(3DSTATE_RASTER_length)];
7633       iris_pack_command(GENX(3DSTATE_RASTER), &dynamic_raster, raster) {
7634          raster.AntialiasingEnable = aa_enable;
7635       }
7636       iris_emit_merge(batch, cso->raster, dynamic_raster,
7637                       ARRAY_SIZE(cso->raster));
7638 
7639       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7640       iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7641          sf.ViewportTransformEnable = !ice->state.window_space_position;
7642 
7643 #if GFX_VER >= 12
7644          sf.DerefBlockSize = ice->state.urb_deref_block_size;
7645 #endif
7646       }
7647       iris_emit_merge(batch, cso->sf, dynamic_sf,
7648                       ARRAY_SIZE(dynamic_sf));
7649    }
7650 
7651    if (dirty & IRIS_DIRTY_WM) {
7652       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7653       uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
7654 
7655       iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
7656          wm.StatisticsEnable = ice->state.statistics_counters_enabled;
7657 
7658          wm.BarycentricInterpolationMode =
7659             iris_fs_barycentric_modes(ice->shaders.prog[MESA_SHADER_FRAGMENT], 0);
7660 
7661          if (fs_data->early_fragment_tests)
7662             wm.EarlyDepthStencilControl = EDSC_PREPS;
7663          else if (fs_data->has_side_effects)
7664             wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7665          else
7666             wm.EarlyDepthStencilControl = EDSC_NORMAL;
7667 
7668          /* We could skip this bit if color writes are enabled. */
7669          if (fs_data->has_side_effects || fs_data->uses_kill)
7670             wm.ForceThreadDispatchEnable = ForceON;
7671       }
7672       iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
7673    }
7674 
7675    if (dirty & IRIS_DIRTY_SBE) {
7676       iris_emit_sbe(batch, ice);
7677    }
7678 
7679    if (dirty & IRIS_DIRTY_PS_BLEND) {
7680       struct iris_blend_state *cso_blend = ice->state.cso_blend;
7681       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7682       const struct shader_info *fs_info =
7683          iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7684 
7685       int dst_blend_factor = cso_blend->ps_dst_blend_factor[0];
7686       int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[0];
7687 
7688       /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
7689        * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
7690        */
7691       if (needs_wa_14018912822) {
7692          if (ice->state.color_blend_zero)
7693             dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7694          if (ice->state.alpha_blend_zero)
7695             dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7696       }
7697 
7698       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7699       iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7700          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7701          pb.AlphaTestEnable = cso_zsa->alpha_enabled;
7702 
7703          pb.DestinationBlendFactor = dst_blend_factor;
7704          pb.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7705 
7706          /* The dual source blending docs caution against using SRC1 factors
7707           * when the shader doesn't use a dual source render target write.
7708           * Empirically, this can lead to GPU hangs, and the results are
7709           * undefined anyway, so simply disable blending to avoid the hang.
7710           */
7711          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7712             (!cso_blend->dual_color_blending || fs_data->dual_src_blend);
7713       }
7714 
7715       iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7716                       ARRAY_SIZE(cso_blend->ps_blend));
7717    }
7718 
7719    if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
7720       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7721 #if GFX_VER >= 9 && GFX_VER < 12
7722       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7723       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7724       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7725          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7726          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7727       }
7728       iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
7729 #else
7730       /* Use modify disable fields which allow us to emit packets
7731        * directly instead of merging them later.
7732        */
7733       iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
7734 #endif
7735 
7736    /* Depth or stencil write changed in cso. */
7737    if (intel_needs_workaround(batch->screen->devinfo, 18019816803) &&
7738        (dirty & IRIS_DIRTY_DS_WRITE_ENABLE)) {
7739       iris_emit_pipe_control_flush(
7740          batch, "workaround: PSS stall after DS write enable change",
7741          PIPE_CONTROL_PSS_STALL_SYNC);
7742    }
7743 
7744 #if GFX_VER >= 12
7745       iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
7746 #endif
7747    }
7748 
7749    if (dirty & IRIS_DIRTY_STENCIL_REF) {
7750 #if GFX_VER >= 12
7751       /* Use modify disable fields which allow us to emit packets
7752        * directly instead of merging them later.
7753        */
7754       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7755       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7756       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7757          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7758          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7759          wmds.StencilTestMaskModifyDisable = true;
7760          wmds.StencilWriteMaskModifyDisable = true;
7761          wmds.StencilStateModifyDisable = true;
7762          wmds.DepthStateModifyDisable = true;
7763       }
7764       iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
7765 #endif
7766    }
7767 
7768    if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
7769       /* Wa_1409725701:
7770        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
7771        *    stored as an array of up to 16 elements. The location of first
7772        *    element of the array, as specified by Pointer to SCISSOR_RECT,
7773        *    should be aligned to a 64-byte boundary.
7774        */
7775       uint32_t alignment = 64;
7776       uint32_t scissor_offset =
7777          emit_state(batch, ice->state.dynamic_uploader,
7778                     &ice->state.last_res.scissor,
7779                     ice->state.scissors,
7780                     sizeof(struct pipe_scissor_state) *
7781                     ice->state.num_viewports, alignment);
7782 
7783       iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7784          ptr.ScissorRectPointer = scissor_offset;
7785       }
7786    }
7787 
7788    if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
7789       struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
7790 
7791       /* Do not emit the cso yet. We may need to update clear params first. */
7792       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7793       struct iris_resource *zres = NULL, *sres = NULL;
7794       if (cso_fb->zsbuf) {
7795          iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
7796                                           &zres, &sres);
7797       }
7798 
7799       if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
7800 #if GFX_VER < 20
7801          uint32_t *clear_params =
7802             cso_z->packets + ARRAY_SIZE(cso_z->packets) -
7803             GENX(3DSTATE_CLEAR_PARAMS_length);
7804 
7805          iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
7806             clear.DepthClearValueValid = true;
7807             clear.DepthClearValue = zres->aux.clear_color.f32[0];
7808          }
7809 #endif
7810       }
7811 
7812       iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
7813 
7814       if (intel_needs_workaround(batch->screen->devinfo, 1408224581) ||
7815           intel_needs_workaround(batch->screen->devinfo, 14014097488) ||
7816           intel_needs_workaround(batch->screen->devinfo, 14016712196)) {
7817          /* Wa_1408224581
7818           *
7819           * Workaround: Gfx12LP Astep only An additional pipe control with
7820           * post-sync = store dword operation would be required.( w/a is to
7821           * have an additional pipe control after the stencil state whenever
7822           * the surface state bits of this state is changing).
7823           *
7824           * This also seems sufficient to handle Wa_14014097488 and
7825           * Wa_14016712196.
7826           */
7827          iris_emit_pipe_control_write(batch, "WA for depth/stencil state",
7828                                       PIPE_CONTROL_WRITE_IMMEDIATE,
7829                                       screen->workaround_address.bo,
7830                                       screen->workaround_address.offset, 0);
7831       }
7832 
7833       if (zres)
7834          genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
7835    }
7836 
7837    if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
7838       /* Listen for buffer changes, and also write enable changes. */
7839       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7840       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
7841    }
7842 
7843    if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
7844       iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7845          for (int i = 0; i < 32; i++) {
7846             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7847          }
7848       }
7849    }
7850 
7851    if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
7852       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7853       iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7854 #if GFX_VER >= 11
7855       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7856        * 3DSTATE_LINE_STIPPLE:
7857        *
7858        *    "Workaround: This command must be followed by a PIPE_CONTROL with
7859        *     CS Stall bit set."
7860        */
7861       iris_emit_pipe_control_flush(batch,
7862                                    "workaround: post 3DSTATE_LINE_STIPPLE",
7863                                    PIPE_CONTROL_CS_STALL);
7864 #endif
7865    }
7866 
7867    if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
7868       iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7869          topo.PrimitiveTopologyType =
7870             translate_prim_type(draw->mode, ice->state.vertices_per_patch);
7871       }
7872    }
7873 
7874    if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
7875       int count = util_bitcount64(ice->state.bound_vertex_buffers);
7876       uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
7877 
7878       if (ice->state.vs_uses_draw_params && !skip_vb_params) {
7879          assert(ice->draw.draw_params.res);
7880 
7881          struct iris_vertex_buffer_state *state =
7882             &(ice->state.genx->vertex_buffers[count]);
7883          pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
7884          struct iris_resource *res = (void *) state->resource;
7885 
7886          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7887             vb.VertexBufferIndex = count;
7888             vb.AddressModifyEnable = true;
7889             vb.BufferPitch = 0;
7890             vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
7891             vb.BufferStartingAddress =
7892                ro_bo(NULL, res->bo->address +
7893                            (int) ice->draw.draw_params.offset);
7894             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7895                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7896 #if GFX_VER >= 12
7897             vb.L3BypassDisable       = true;
7898 #endif
7899          }
7900          dynamic_bound |= 1ull << count;
7901          count++;
7902       }
7903 
7904       if (ice->state.vs_uses_derived_draw_params && !skip_vb_params) {
7905          struct iris_vertex_buffer_state *state =
7906             &(ice->state.genx->vertex_buffers[count]);
7907          pipe_resource_reference(&state->resource,
7908                                  ice->draw.derived_draw_params.res);
7909          struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
7910 
7911          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7912             vb.VertexBufferIndex = count;
7913             vb.AddressModifyEnable = true;
7914             vb.BufferPitch = 0;
7915             vb.BufferSize =
7916                res->bo->size - ice->draw.derived_draw_params.offset;
7917             vb.BufferStartingAddress =
7918                ro_bo(NULL, res->bo->address +
7919                            (int) ice->draw.derived_draw_params.offset);
7920             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7921                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7922 #if GFX_VER >= 12
7923             vb.L3BypassDisable       = true;
7924 #endif
7925          }
7926          dynamic_bound |= 1ull << count;
7927          count++;
7928       }
7929 
7930       if (count) {
7931 #if GFX_VER >= 11
7932          /* Gfx11+ doesn't need the cache workaround below */
7933          uint64_t bound = dynamic_bound;
7934          while (bound) {
7935             const int i = u_bit_scan64(&bound);
7936             iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
7937                                   false, IRIS_DOMAIN_VF_READ);
7938          }
7939 #else
7940          /* The VF cache designers cut corners, and made the cache key's
7941           * <VertexBufferIndex, Memory Address> tuple only consider the bottom
7942           * 32 bits of the address.  If you have two vertex buffers which get
7943           * placed exactly 4 GiB apart and use them in back-to-back draw calls,
7944           * you can get collisions (even within a single batch).
7945           *
7946           * So, we need to do a VF cache invalidate if the buffer for a VB
7947           * slot changes [48:32] address bits from the previous time.
7948           */
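         /* For example (hypothetical addresses): vertex buffers placed at
          * 0x1_00001000 and 0x2_00001000 share their low 32 address bits,
          * so the VF cache would alias them; the [48:32] comparison below
          * is what catches that case.
          */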
7949          unsigned flush_flags = 0;
7950 
7951          uint64_t bound = dynamic_bound;
7952          while (bound) {
7953             const int i = u_bit_scan64(&bound);
7954             uint16_t high_bits = 0;
7955 
7956             struct iris_resource *res =
7957                (void *) genx->vertex_buffers[i].resource;
7958             if (res) {
7959                iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
7960 
7961                high_bits = res->bo->address >> 32ull;
7962                if (high_bits != ice->state.last_vbo_high_bits[i]) {
7963                   flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
7964                                  PIPE_CONTROL_CS_STALL;
7965                   ice->state.last_vbo_high_bits[i] = high_bits;
7966                }
7967             }
7968          }
7969 
7970          if (flush_flags) {
7971             iris_emit_pipe_control_flush(batch,
7972                                          "workaround: VF cache 32-bit key [VB]",
7973                                          flush_flags);
7974          }
7975 #endif
7976 
7977          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7978 
7979          uint32_t *map =
7980             iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
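         /* 3DSTATE_VERTEX_BUFFERS is one header DWord followed by vb_dwords
          * of VERTEX_BUFFER_STATE per buffer; like other 3DSTATE packets,
          * DWordLength is the total DWord count minus two.
          */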
7981          _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7982             vb.DWordLength = (vb_dwords * count + 1) - 2;
7983          }
7984          map += 1;
7985 
7986          const struct iris_vertex_element_state *cso_ve =
7987             ice->state.cso_vertex_elements;
7988 
7989          bound = dynamic_bound;
7990          while (bound) {
7991             const int i = u_bit_scan64(&bound);
7992 
7993             uint32_t vb_stride[GENX(VERTEX_BUFFER_STATE_length)];
7994             struct iris_bo *bo =
7995                iris_resource_bo(genx->vertex_buffers[i].resource);
7996             iris_pack_state(GENX(VERTEX_BUFFER_STATE), &vb_stride, vbs) {
7997                vbs.BufferPitch = cso_ve->stride[i];
7998                /* Unnecessary except to defeat the genxml nonzero checker */
7999                vbs.MOCS = iris_mocs(bo, &screen->isl_dev,
8000                                     ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8001             }
8002             for (unsigned d = 0; d < vb_dwords; d++)
8003                map[d] = genx->vertex_buffers[i].state[d] | vb_stride[d];
8004 
8005             map += vb_dwords;
8006          }
8007       }
8008    }
8009 
8010    if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
8011       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8012       const unsigned entries = MAX2(cso->count, 1);
8013       if (!(ice->state.vs_needs_sgvs_element ||
8014             ice->state.vs_uses_derived_draw_params ||
8015             ice->state.vs_needs_edge_flag)) {
8016          iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
8017                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
8018       } else {
8019          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
8020          const unsigned dyn_count = cso->count +
8021             ice->state.vs_needs_sgvs_element +
8022             ice->state.vs_uses_derived_draw_params;
8023 
8024          iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
8025                            &dynamic_ves, ve) {
8026             ve.DWordLength =
8027                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
8028          }
8029          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
8030                 (cso->count - ice->state.vs_needs_edge_flag) *
8031                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
8032          uint32_t *ve_pack_dest =
8033             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
8034                          GENX(VERTEX_ELEMENT_STATE_length)];
8035 
8036          if (ice->state.vs_needs_sgvs_element) {
8037             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
8038                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
8039             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8040                ve.Valid = true;
8041                ve.VertexBufferIndex =
8042                   util_bitcount64(ice->state.bound_vertex_buffers);
8043                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8044                ve.Component0Control = base_ctrl;
8045                ve.Component1Control = base_ctrl;
8046                ve.Component2Control = VFCOMP_STORE_0;
8047                ve.Component3Control = VFCOMP_STORE_0;
8048             }
8049             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8050          }
8051          if (ice->state.vs_uses_derived_draw_params) {
8052             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8053                ve.Valid = true;
8054                ve.VertexBufferIndex =
8055                   util_bitcount64(ice->state.bound_vertex_buffers) +
8056                   ice->state.vs_uses_draw_params;
8057                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8058                ve.Component0Control = VFCOMP_STORE_SRC;
8059                ve.Component1Control = VFCOMP_STORE_SRC;
8060                ve.Component2Control = VFCOMP_STORE_0;
8061                ve.Component3Control = VFCOMP_STORE_0;
8062             }
8063             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8064          }
8065          if (ice->state.vs_needs_edge_flag) {
8066             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
8067                ve_pack_dest[i] = cso->edgeflag_ve[i];
8068          }
8069 
8070          iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
8071                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
8072       }
8073 
8074       if (!ice->state.vs_needs_edge_flag) {
8075          iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
8076                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8077       } else {
8078          assert(cso->count > 0);
8079          const unsigned edgeflag_index = cso->count - 1;
8080          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
8081          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
8082                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
8083 
8084          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
8085             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
8086          iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
8087             vi.VertexElementIndex = edgeflag_index +
8088                ice->state.vs_needs_sgvs_element +
8089                ice->state.vs_uses_derived_draw_params;
8090          }
8091          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
8092             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
8093 
8094          iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
8095                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8096       }
8097    }
8098 
8099    if (dirty & IRIS_DIRTY_VF_SGVS) {
8100       const struct iris_vs_data *vs_data =
8101          iris_vs_data(ice->shaders.prog[MESA_SHADER_VERTEX]);
8102       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8103 
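      /* gl_VertexID and gl_InstanceID are written into components .z and .w
       * of the extra SGVS vertex element appended after the user-supplied
       * elements, hence the cso->count - edge-flag element offset below.
       */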
8104       iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
8105          if (vs_data->uses_vertexid) {
8106             sgv.VertexIDEnable = true;
8107             sgv.VertexIDComponentNumber = 2;
8108             sgv.VertexIDElementOffset =
8109                cso->count - ice->state.vs_needs_edge_flag;
8110          }
8111 
8112          if (vs_data->uses_instanceid) {
8113             sgv.InstanceIDEnable = true;
8114             sgv.InstanceIDComponentNumber = 3;
8115             sgv.InstanceIDElementOffset =
8116                cso->count - ice->state.vs_needs_edge_flag;
8117          }
8118       }
8119    }
8120 
8121    if (dirty & IRIS_DIRTY_VF) {
8122       iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
8123 #if GFX_VERx10 >= 125
8124          vf.GeometryDistributionEnable = true;
8125 #endif
8126          if (draw->primitive_restart) {
8127             vf.IndexedDrawCutIndexEnable = true;
8128             vf.CutIndex = draw->restart_index;
8129          }
8130       }
8131    }
8132 
8133 #if GFX_VERx10 >= 125
8134    if (dirty & IRIS_DIRTY_VFG) {
8135       iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
8136          /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
8137          vfg.DistributionMode =
8138             ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL ? RR_STRICT :
8139                                                                RR_FREE;
8140          if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
8141              program_uses_primitive_id)
8142             vfg.DistributionGranularity = InstanceLevelGranularity;
8143          else
8144             vfg.DistributionGranularity = BatchLevelGranularity;
8145 #if INTEL_WA_14014851047_GFX_VER
8146          vfg.GranularityThresholdDisable =
8147             intel_needs_workaround(batch->screen->devinfo, 14014851047);
8148 #endif
8149          vfg.ListCutIndexEnable = draw->primitive_restart;
8150          /* 192 vertices for TRILIST_ADJ */
8151          vfg.ListNBatchSizeScale = 0;
8152          /* Batch size of 384 vertices */
8153          vfg.List3BatchSizeScale = 2;
8154          /* Batch size of 128 vertices */
8155          vfg.List2BatchSizeScale = 1;
8156          /* Batch size of 128 vertices */
8157          vfg.List1BatchSizeScale = 2;
8158          /* Batch size of 256 vertices for STRIP topologies */
8159          vfg.StripBatchSizeScale = 3;
8160          /* 192 control points for PATCHLIST_3 */
8161          vfg.PatchBatchSizeScale = 1;
8162          /* 192 control points for PATCHLIST_3 */
8163          vfg.PatchBatchSizeMultiplier = 31;
8164       }
8165    }
8166 #endif
8167 
8168    if (dirty & IRIS_DIRTY_VF_STATISTICS) {
8169       iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
8170          vf.StatisticsEnable = true;
8171       }
8172    }
8173 
8174 #if GFX_VER == 8
8175    if (dirty & IRIS_DIRTY_PMA_FIX) {
8176       bool enable = want_pma_fix(ice);
8177       genX(update_pma_fix)(ice, batch, enable);
8178    }
8179 #endif
8180 
8181    if (ice->state.current_hash_scale != 1)
8182       genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
8183 
8184 #if GFX_VER >= 12
8185    genX(invalidate_aux_map_state)(batch);
8186 #endif
8187 }
8188 
8189 static void
8190 flush_vbos(struct iris_context *ice, struct iris_batch *batch)
8191 {
8192    struct iris_genx_state *genx = ice->state.genx;
8193    uint64_t bound = ice->state.bound_vertex_buffers;
8194    while (bound) {
8195       const int i = u_bit_scan64(&bound);
8196       struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
8197       iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
8198    }
8199 }
8200 
8201 static bool
8202 point_or_line_list(enum mesa_prim prim_type)
8203 {
8204    switch (prim_type) {
8205    case MESA_PRIM_POINTS:
8206    case MESA_PRIM_LINES:
8207    case MESA_PRIM_LINE_STRIP:
8208    case MESA_PRIM_LINES_ADJACENCY:
8209    case MESA_PRIM_LINE_STRIP_ADJACENCY:
8210    case MESA_PRIM_LINE_LOOP:
8211       return true;
8212    default:
8213       return false;
8214    }
8215    return false;
8216 }
8217 
8218 void
8219 genX(emit_breakpoint)(struct iris_batch *batch, bool emit_before_draw)
8220 {
8221    struct iris_context *ice = batch->ice;
8222    uint32_t draw_count = emit_before_draw ?
8223                          p_atomic_inc_return(&ice->draw_call_count) :
8224                          p_atomic_read(&ice->draw_call_count);
8225 
8226    if (((draw_count == intel_debug_bkp_before_draw_count &&
8227          emit_before_draw) ||
8228         (draw_count == intel_debug_bkp_after_draw_count &&
8229          !emit_before_draw)))  {
8230       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
8231          sem.WaitMode            = PollingMode;
8232          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
8233          sem.SemaphoreDataDword  = 0x1;
8234          sem.SemaphoreAddress    = rw_bo(batch->screen->breakpoint_bo, 0,
8235                                          IRIS_DOMAIN_OTHER_WRITE);
8236       };
8237    }
8238 }
8239 
8240 void
8241 genX(emit_3dprimitive_was)(struct iris_batch *batch,
8242                            const struct pipe_draw_indirect_info *indirect,
8243                            uint32_t primitive_type,
8244                            uint32_t vertex_count)
8245 {
8246    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8247    UNUSED const struct iris_context *ice = batch->ice;
8248 
8249 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
8250    if (intel_needs_workaround(devinfo, 22014412737) &&
8251        (point_or_line_list(primitive_type) || indirect ||
8252         (vertex_count == 1 || vertex_count == 2))) {
8253       iris_emit_pipe_control_write(batch, "Wa_22014412737",
8254                                    PIPE_CONTROL_WRITE_IMMEDIATE,
8255                                    batch->screen->workaround_bo,
8256                                    batch->screen->workaround_address.offset,
8257                                    0ull);
8258       batch->num_3d_primitives_emitted = 0;
8259    } else if (intel_needs_workaround(devinfo, 16014538804)) {
8260       batch->num_3d_primitives_emitted++;
8261 
8262       /* Wa_16014538804 - Send empty/dummy pipe control after 3 3DPRIMITIVE. */
8263       if (batch->num_3d_primitives_emitted == 3) {
8264          iris_emit_pipe_control_flush(batch, "Wa_16014538804", 0);
8265          batch->num_3d_primitives_emitted = 0;
8266       }
8267    }
8268 #endif
8269 }
8270 
8271 void
8272 genX(urb_workaround)(struct iris_batch *batch,
8273                      const struct intel_urb_config *urb_cfg)
8274 {
8275 #if INTEL_NEEDS_WA_16014912113
8276    if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
8277                                MESA_SHADER_TESS_EVAL) &&
8278        batch->ice->shaders.last_urb.size[0] != 0) {
8279       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
8280          iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
8281             urb._3DCommandSubOpcode += i;
8282             urb.VSURBStartingAddress =
8283                batch->ice->shaders.last_urb.start[i];
8284             urb.VSURBEntryAllocationSize =
8285                batch->ice->shaders.last_urb.size[i] - 1;
8286             urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
8287          }
8288       }
8289       iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8290          pc.HDCPipelineFlushEnable = true;
8291       }
8292    }
8293 #endif
8294 
8295    /* Update current urb config. */
8296    memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
8297           sizeof(struct intel_urb_config));
8298 }
8299 
8300 static void
8301 iris_emit_index_buffer(struct iris_context *ice,
8302                        struct iris_batch *batch,
8303                        const struct pipe_draw_info *draw,
8304                        const struct pipe_draw_start_count_bias *sc)
8305 {
8306    unsigned offset;
8307 
8308    if (draw->has_user_indices) {
8309       unsigned start_offset = draw->index_size * sc->start;
8310 
8311       u_upload_data(ice->ctx.const_uploader, start_offset,
8312                     sc->count * draw->index_size, 4,
8313                     (char*)draw->index.user + start_offset,
8314                     &offset, &ice->state.last_res.index_buffer);
8315       offset -= start_offset;
8316    } else {
8317       struct iris_resource *res = (void *) draw->index.resource;
8318       res->bind_history |= PIPE_BIND_INDEX_BUFFER;
8319 
8320       pipe_resource_reference(&ice->state.last_res.index_buffer,
8321                               draw->index.resource);
8322       offset = 0;
8323 
8324       iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
8325    }
8326 
8327    struct iris_genx_state *genx = ice->state.genx;
8328    struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
8329 
8330    uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
8331    iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
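      /* draw->index_size is 1, 2, or 4 bytes; shifting right by one maps
       * those onto the 0/1/2 (byte/word/dword) IndexFormat encoding.
       */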
8332       ib.IndexFormat = draw->index_size >> 1;
8333       ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
8334                           ISL_SURF_USAGE_INDEX_BUFFER_BIT);
8335       ib.BufferSize = bo->size - offset;
8336       ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
8337 #if GFX_VER >= 12
8338       ib.L3BypassDisable       = true;
8339 #endif
8340    }
8341 
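   /* Only re-emit 3DSTATE_INDEX_BUFFER (and re-pin the BO) when the packed
    * DWords differ from the last packet we emitted.
    */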
8342    if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
8343       memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
8344       iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
8345       iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
8346    }
8347 
8348 #if GFX_VER < 11
8349    /* The VF cache key only uses 32-bits, see vertex buffer comment above */
8350    uint16_t high_bits = bo->address >> 32ull;
8351    if (high_bits != ice->state.last_index_bo_high_bits) {
8352       iris_emit_pipe_control_flush(batch,
8353                                    "workaround: VF cache 32-bit key [IB]",
8354                                    PIPE_CONTROL_VF_CACHE_INVALIDATE |
8355                                    PIPE_CONTROL_CS_STALL);
8356       ice->state.last_index_bo_high_bits = high_bits;
8357    }
8358 #endif
8359 }
8360 
8361 
8362 static void
8363 iris_upload_render_state(struct iris_context *ice,
8364                          struct iris_batch *batch,
8365                          const struct pipe_draw_info *draw,
8366                          unsigned drawid_offset,
8367                          const struct pipe_draw_indirect_info *indirect,
8368                          const struct pipe_draw_start_count_bias *sc)
8369 {
8370    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8371    bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8372 
8373    trace_intel_begin_draw(&batch->trace);
8374 
8375    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8376       flush_vbos(ice, batch);
8377 
8378    iris_batch_sync_region_start(batch);
8379 
8380    /* Always pin the binder.  If we're emitting new binding table pointers,
8381     * we need it.  If not, we're probably inheriting old tables via the
8382     * context, and need it anyway.  Since true zero-bindings cases are
8383     * practically non-existent, just pin it and avoid last_res tracking.
8384     */
8385    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8386                       IRIS_DOMAIN_NONE);
8387 
8388    if (!batch->contains_draw) {
8389       if (GFX_VER == 12) {
8390          /* Re-emit constants when starting a new batch buffer in order to
8391           * work around push constant corruption on context switch.
8392           *
8393           * XXX - Provide hardware spec quotation when available.
8394           */
8395          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8396                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8397                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8398                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8399                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8400       }
8401       batch->contains_draw = true;
8402    }
8403 
8404    if (!batch->contains_draw_with_next_seqno) {
8405       iris_restore_render_saved_bos(ice, batch, draw);
8406       batch->contains_draw_with_next_seqno = true;
8407    }
8408 
8409    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8410     * Wa_16011107343 (same for gfx12)
8411     * We implement this by setting TCS dirty on each draw.
8412     */
8413    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8414        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8415       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8416    }
8417 
8418    iris_upload_dirty_render_state(ice, batch, draw, false);
8419 
8420    if (draw->index_size > 0)
8421       iris_emit_index_buffer(ice, batch, draw, sc);
8422 
8423    if (indirect) {
8424       struct mi_builder b;
8425       uint32_t mocs;
8426       mi_builder_init(&b, batch->screen->devinfo, batch);
8427 
8428 #define _3DPRIM_END_OFFSET          0x2420
8429 #define _3DPRIM_START_VERTEX        0x2430
8430 #define _3DPRIM_VERTEX_COUNT        0x2434
8431 #define _3DPRIM_INSTANCE_COUNT      0x2438
8432 #define _3DPRIM_START_INSTANCE      0x243C
8433 #define _3DPRIM_BASE_VERTEX         0x2440
8434 
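      /* With IndirectParameterEnable set, 3DPRIMITIVE reads its draw
       * arguments from these MMIO registers; the MI stores below load them
       * from the application's indirect buffer (or from stream output).
       */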
8435       if (!indirect->count_from_stream_output) {
8436          if (indirect->indirect_draw_count) {
8437             use_predicate = true;
8438 
8439             struct iris_bo *draw_count_bo =
8440                iris_resource_bo(indirect->indirect_draw_count);
8441             unsigned draw_count_offset =
8442                indirect->indirect_draw_count_offset;
8443             mocs = iris_mocs(draw_count_bo, &batch->screen->isl_dev, 0);
8444             mi_builder_set_mocs(&b, mocs);
8445 
8446             if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
8447                /* comparison = draw id < draw count */
8448                struct mi_value comparison =
8449                   mi_ult(&b, mi_imm(drawid_offset),
8450                              mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8451 
8452                /* predicate = comparison & conditional rendering predicate */
8453                mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
8454                             mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
8455             } else {
8456                uint32_t mi_predicate;
8457 
8458                /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
8459                mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(drawid_offset));
8460                /* Upload the current draw count from the draw parameters buffer
8461                 * to MI_PREDICATE_SRC0. Zero the top 32-bits of
8462                 * MI_PREDICATE_SRC0.
8463                 */
8464                mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
8465                         mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8466 
8467                if (drawid_offset == 0) {
8468                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
8469                                  MI_PREDICATE_COMBINEOP_SET |
8470                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8471                } else {
8472                   /* While draw_index < draw_count the predicate's result will be
8473                    *  (draw_index == draw_count) ^ TRUE = TRUE
8474                    * When draw_index == draw_count the result is
8475                    *  (TRUE) ^ TRUE = FALSE
8476                    * After this all results will be:
8477                    *  (FALSE) ^ FALSE = FALSE
8478                    */
8479                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
8480                                  MI_PREDICATE_COMBINEOP_XOR |
8481                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8482                }
8483                iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
8484             }
8485          }
8486          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8487          assert(bo);
8488 
8489          mocs = iris_mocs(bo, &batch->screen->isl_dev, 0);
8490          mi_builder_set_mocs(&b, mocs);
8491 
8492          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8493                   mi_mem32(ro_bo(bo, indirect->offset + 0)));
8494          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8495                   mi_mem32(ro_bo(bo, indirect->offset + 4)));
8496          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX),
8497                   mi_mem32(ro_bo(bo, indirect->offset + 8)));
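         /* Indexed draws use {count, instanceCount, firstIndex, baseVertex,
          * firstInstance} while non-indexed draws use {count, instanceCount,
          * first, firstInstance}, hence the different offsets for the last
          * two stores.
          */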
8498          if (draw->index_size) {
8499             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX),
8500                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8501             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8502                      mi_mem32(ro_bo(bo, indirect->offset + 16)));
8503          } else {
8504             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8505                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8506             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8507          }
8508       } else if (indirect->count_from_stream_output) {
8509          struct iris_stream_output_target *so =
8510             (void *) indirect->count_from_stream_output;
8511          struct iris_bo *so_bo = iris_resource_bo(so->offset.res);
8512 
8513          mocs = iris_mocs(so_bo, &batch->screen->isl_dev, 0);
8514          mi_builder_set_mocs(&b, mocs);
8515 
8516          iris_emit_buffer_barrier_for(batch, so_bo, IRIS_DOMAIN_OTHER_READ);
8517 
8518          struct iris_address addr = ro_bo(so_bo, so->offset.offset);
8519          struct mi_value offset =
8520             mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
8521          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8522                       mi_udiv32_imm(&b, offset, so->stride));
8523          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX), mi_imm(0));
8524          mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8525          mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE), mi_imm(0));
8526          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8527                   mi_imm(draw->instance_count));
8528       }
8529    }
8530 
8531    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8532 
8533    genX(maybe_emit_breakpoint)(batch, true);
8534 
8535    iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8536       prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8537       prim.PredicateEnable = use_predicate;
8538 #if GFX_VERx10 >= 125
8539       prim.TBIMREnable = ice->state.use_tbimr;
8540 #endif
8541       if (indirect) {
8542          prim.IndirectParameterEnable = true;
8543       } else {
8544          prim.StartInstanceLocation = draw->start_instance;
8545          prim.InstanceCount = draw->instance_count;
8546          prim.VertexCountPerInstance = sc->count;
8547 
8548          prim.StartVertexLocation = sc->start;
8549 
8550          if (draw->index_size) {
8551             prim.BaseVertexLocation += sc->index_bias;
8552          }
8553       }
8554    }
8555 
8556    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8557    genX(maybe_emit_breakpoint)(batch, false);
8558 
8559    iris_batch_sync_region_end(batch);
8560 
8561    uint32_t count = (sc) ? sc->count : 0;
8562    count *= draw->instance_count ? draw->instance_count : 1;
8563    trace_intel_end_draw(&batch->trace, count);
8564 }
8565 
8566 static void
8567 iris_upload_indirect_render_state(struct iris_context *ice,
8568                                   const struct pipe_draw_info *draw,
8569                                   const struct pipe_draw_indirect_info *indirect,
8570                                   const struct pipe_draw_start_count_bias *sc)
8571 {
8572 #if GFX_VERx10 >= 125
8573    assert(indirect);
8574 
8575    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8576    UNUSED struct iris_screen *screen = batch->screen;
8577    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8578    const bool use_predicate =
8579       ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8580 
8581    trace_intel_begin_draw(&batch->trace);
8582 
8583    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8584       flush_vbos(ice, batch);
8585 
8586    iris_batch_sync_region_start(batch);
8587 
8588    /* Always pin the binder.  If we're emitting new binding table pointers,
8589     * we need it.  If not, we're probably inheriting old tables via the
8590     * context, and need it anyway.  Since true zero-bindings cases are
8591     * practically non-existent, just pin it and avoid last_res tracking.
8592     */
8593    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8594                       IRIS_DOMAIN_NONE);
8595 
8596    if (!batch->contains_draw) {
8597       /* Re-emit constants when starting a new batch buffer in order to
8598        * work around push constant corruption on context switch.
8599        *
8600        * XXX - Provide hardware spec quotation when available.
8601        */
8602       ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8603                                  IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8604                                  IRIS_STAGE_DIRTY_CONSTANTS_TES |
8605                                  IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8606                                  IRIS_STAGE_DIRTY_CONSTANTS_FS);
8607       batch->contains_draw = true;
8608    }
8609 
8610    if (!batch->contains_draw_with_next_seqno) {
8611       iris_restore_render_saved_bos(ice, batch, draw);
8612       batch->contains_draw_with_next_seqno = true;
8613    }
8614 
8615    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8616     * Wa_16011107343 (same for gfx12)
8617     * We implement this by setting TCS dirty on each draw.
8618     */
8619    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8620        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8621       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8622    }
8623 
8624    iris_upload_dirty_render_state(ice, batch, draw, false);
8625 
8626    if (draw->index_size > 0)
8627       iris_emit_index_buffer(ice, batch, draw, sc);
8628 
8629    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8630 
8631    genX(maybe_emit_breakpoint)(batch, true);
8632 
8633    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
8634       ind.ArgumentFormat             =
8635          draw->index_size > 0 ? XI_DRAWINDEXED : XI_DRAW;
8636       ind.PredicateEnable            = use_predicate;
8637       ind.TBIMREnabled               = ice->state.use_tbimr;
8638       ind.MaxCount                   = indirect->draw_count;
8639 
8640       if (indirect->buffer) {
8641          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8642          ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
8643          ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
8644       } else {
8645          ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8646       }
8647 
8648       if (indirect->indirect_draw_count) {
8649          struct iris_bo *draw_count_bo      =
8650             iris_resource_bo(indirect->indirect_draw_count);
8651          ind.CountBufferIndirectEnable      = true;
8652          ind.CountBufferAddress             =
8653             ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
8654       }
8655    }
8656 
8657    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8658    genX(maybe_emit_breakpoint)(batch, false);
8659 
8660    iris_batch_sync_region_end(batch);
8661 
8662    uint32_t count = (sc) ? sc->count : 0;
8663    count *= draw->instance_count ? draw->instance_count : 1;
8664    trace_intel_end_draw(&batch->trace, count);
8665 #else
8666    unreachable("Unsupported path");
8667 #endif /* GFX_VERx10 >= 125 */
8668 }
8669 
8670 static void
8671 iris_upload_indirect_shader_render_state(struct iris_context *ice,
8672                                          const struct pipe_draw_info *draw,
8673                                          const struct pipe_draw_indirect_info *indirect,
8674                                          const struct pipe_draw_start_count_bias *sc)
8675 {
8676    assert(indirect);
8677 
8678    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8679    UNUSED struct iris_screen *screen = batch->screen;
8680    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8681 
8682    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8683       flush_vbos(ice, batch);
8684 
8685    iris_batch_sync_region_start(batch);
8686 
8687    /* Always pin the binder.  If we're emitting new binding table pointers,
8688     * we need it.  If not, we're probably inheriting old tables via the
8689     * context, and need it anyway.  Since true zero-bindings cases are
8690     * practically non-existent, just pin it and avoid last_res tracking.
8691     */
8692    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8693                       IRIS_DOMAIN_NONE);
8694 
8695    if (!batch->contains_draw) {
8696       if (GFX_VER == 12) {
8697          /* Re-emit constants when starting a new batch buffer in order to
8698           * work around push constant corruption on context switch.
8699           *
8700           * XXX - Provide hardware spec quotation when available.
8701           */
8702          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8703                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8704                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8705                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8706                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8707       }
8708       batch->contains_draw = true;
8709    }
8710 
8711    if (!batch->contains_draw_with_next_seqno) {
8712       iris_restore_render_saved_bos(ice, batch, draw);
8713       batch->contains_draw_with_next_seqno = true;
8714    }
8715 
8716    if (draw->index_size > 0)
8717       iris_emit_index_buffer(ice, batch, draw, sc);
8718 
8719    /* Make sure we have enough space to keep all the commands in a single BO
8720     * (the jumps emitted below reference addresses inside this batch).
8721     */
8722    iris_require_command_space(batch, 2000);
8723 
8724 #ifndef NDEBUG
8725    struct iris_bo *command_bo = batch->bo;
8726 #endif
8727 
8728    /* Jump point to generate more draws if we run out of space in the ring
8729     * buffer.
8730     */
8731    uint64_t gen_addr = iris_batch_current_address_u64(batch);
8732 
8733    iris_handle_always_flush_cache(batch);
8734 
8735 #if GFX_VER == 9
8736    iris_emit_pipe_control_flush(batch, "before generation",
8737                                 PIPE_CONTROL_VF_CACHE_INVALIDATE);
8738 #endif
8739 
8740    struct iris_address params_addr;
8741    struct iris_gen_indirect_params *params =
8742       genX(emit_indirect_generate)(batch, draw, indirect, sc,
8743                                    &params_addr);
8744 
8745    iris_emit_pipe_control_flush(batch, "after generation flush",
8746                                 ((ice->state.vs_uses_draw_params ||
8747                                   ice->state.vs_uses_derived_draw_params) ?
8748                                  PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
8749                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8750                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
8751                                 PIPE_CONTROL_CS_STALL);
8752 
8753    trace_intel_begin_draw(&batch->trace);
8754 
8755    /* Always pin the binder.  If we're emitting new binding table pointers,
8756     * we need it.  If not, we're probably inheriting old tables via the
8757     * context, and need it anyway.  Since true zero-bindings cases are
8758     * practically non-existent, just pin it and avoid last_res tracking.
8759     */
8760    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8761                       IRIS_DOMAIN_NONE);
8762 
8763    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8764     * Wa_16011107343 (same for gfx12)
8765     * We implement this by setting TCS dirty on each draw.
8766     */
8767    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8768        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8769       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8770    }
8771 
8772    iris_upload_dirty_render_state(ice, batch, draw, true);
8773 
8774    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8775 
8776    genX(maybe_emit_breakpoint)(batch, true);
8777 
8778 #if GFX_VER >= 12
8779    iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
8780       arb.PreParserDisableMask = true;
8781       arb.PreParserDisable = true;
8782    }
8783 #endif
8784 
8785    iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8786       bbs.AddressSpaceIndicator = ASI_PPGTT;
8787       bbs.BatchBufferStartAddress = (struct iris_address) {
8788          .bo = ice->draw.generation.ring_bo,
8789       };
8790    }
8791 
8792    /* Run the ring buffer one more time with the next set of commands */
8793    uint64_t inc_addr = iris_batch_current_address_u64(batch);
8794    {
8795       iris_emit_pipe_control_flush(batch,
8796                                    "post generated draws wait",
8797                                    PIPE_CONTROL_STALL_AT_SCOREBOARD |
8798                                    PIPE_CONTROL_CS_STALL);
8799 
8800       struct mi_builder b;
8801       mi_builder_init(&b, batch->screen->devinfo, batch);
8802 
8803       struct iris_address draw_base_addr = iris_address_add(
8804          params_addr,
8805          offsetof(struct iris_gen_indirect_params, draw_base));
8806 
8807       const uint32_t mocs =
8808          iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
8809       mi_builder_set_mocs(&b, mocs);
8810 
8811       mi_store(&b, mi_mem32(draw_base_addr),
8812                    mi_iadd(&b, mi_mem32(draw_base_addr),
8813                                mi_imm(params->ring_count)));
8814 
8815       iris_emit_pipe_control_flush(batch,
8816                                    "post generation base increment",
8817                                    PIPE_CONTROL_CS_STALL |
8818                                    PIPE_CONTROL_CONST_CACHE_INVALIDATE);
8819 
8820       iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8821          bbs.AddressSpaceIndicator = ASI_PPGTT;
8822          bbs.BatchBufferStartAddress = (struct iris_address) {
8823             .offset = gen_addr,
8824          };
8825       }
8826    }
8827 
8828    /* Exit of the ring buffer */
8829    uint64_t end_addr = iris_batch_current_address_u64(batch);
8830 
8831 #ifndef NDEBUG
8832    assert(command_bo == batch->bo);
8833 #endif
8834 
8835    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8836    genX(maybe_emit_breakpoint)(batch, false);
8837 
8838    iris_emit_pipe_control_flush(batch,
8839                                 "post generated draws wait",
8840                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8841                                 PIPE_CONTROL_CS_STALL);
8842 
8843    params->gen_addr = inc_addr;
8844    params->end_addr = end_addr;
8845 
8846    iris_batch_sync_region_end(batch);
8847 
8848    uint32_t count = (sc) ? sc->count : 0;
8849    count *= draw->instance_count ? draw->instance_count : 1;
8850    trace_intel_end_draw(&batch->trace, count);
8851 }
8852 
8853 static void
8854 iris_load_indirect_location(struct iris_context *ice,
8855                             struct iris_batch *batch,
8856                             const struct pipe_grid_info *grid)
8857 {
8858 #define GPGPU_DISPATCHDIMX 0x2500
8859 #define GPGPU_DISPATCHDIMY 0x2504
8860 #define GPGPU_DISPATCHDIMZ 0x2508
8861 
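   /* Indirect dispatches read their thread group counts from these MMIO
    * registers; load them from the grid-size buffer with MI commands.
    */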
8862    assert(grid->indirect);
8863 
8864    struct iris_state_ref *grid_size = &ice->state.grid_size;
8865    struct iris_bo *bo = iris_resource_bo(grid_size->res);
8866    struct mi_builder b;
8867    mi_builder_init(&b, batch->screen->devinfo, batch);
8868    struct mi_value size_x = mi_mem32(ro_bo(bo, grid_size->offset + 0));
8869    struct mi_value size_y = mi_mem32(ro_bo(bo, grid_size->offset + 4));
8870    struct mi_value size_z = mi_mem32(ro_bo(bo, grid_size->offset + 8));
8871    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
8872    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
8873    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
8874 }
8875 
8876 static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo)
8877 {
8878    // TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch
8879    return devinfo->has_indirect_unroll;
8880 }
8881 
8882 #if GFX_VERx10 >= 125
8883 
8884 static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
8885                                                 struct iris_batch *batch,
8886                                                 const struct pipe_grid_info *grid,
8887                                                 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd)
8888 {
8889    const struct iris_screen *screen = batch->screen;
8890    struct iris_compiled_shader *shader =
8891       ice->shaders.prog[MESA_SHADER_COMPUTE];
8892    const struct iris_cs_data *cs_data = iris_cs_data(shader);
8893    const struct intel_cs_dispatch_info dispatch =
8894       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
8895    struct iris_bo *indirect = iris_resource_bo(grid->indirect);
8896    const int dispatch_size = dispatch.simd_size / 16;
8897 
8898    struct GENX(COMPUTE_WALKER_BODY) body = {};
8899    body.SIMDSize            = dispatch_size;
8900    body.MessageSIMD         = dispatch_size;
8901    body.GenerateLocalID     = cs_data->generate_local_id != 0;
8902    body.EmitLocal           = cs_data->generate_local_id;
8903    body.WalkOrder           = cs_data->walk_order;
8904    body.TileLayout          = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
8905                               TileY32bpe : Linear;
8906    body.LocalXMaximum       = grid->block[0] - 1;
8907    body.LocalYMaximum       = grid->block[1] - 1;
8908    body.LocalZMaximum       = grid->block[2] - 1;
8909    body.ExecutionMask       = dispatch.right_mask;
8910    body.PostSync.MOCS       = iris_mocs(NULL, &screen->isl_dev, 0);
8911    body.InterfaceDescriptor = idd;
8912 
8913    struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
8914    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {
8915       ind.PredicateEnable            =
8916          ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8917       ind.MaxCount                   = 1;
8918       ind.COMPUTE_WALKER_BODY        = body;
8919       ind.ArgumentBufferStartAddress = indirect_bo;
8920       ind.MOCS                       =
8921          iris_mocs(indirect_bo.bo, &screen->isl_dev, 0);
8922    }
8923 }
8924 
8925 static void
8926 iris_upload_compute_walker(struct iris_context *ice,
8927                            struct iris_batch *batch,
8928                            const struct pipe_grid_info *grid)
8929 {
8930    const uint64_t stage_dirty = ice->state.stage_dirty;
8931    struct iris_screen *screen = batch->screen;
8932    const struct intel_device_info *devinfo = screen->devinfo;
8933    struct iris_binder *binder = &ice->state.binder;
8934    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
8935    struct iris_compiled_shader *shader =
8936       ice->shaders.prog[MESA_SHADER_COMPUTE];
8937    const struct iris_cs_data *cs_data = iris_cs_data(shader);
8938    const struct intel_cs_dispatch_info dispatch =
8939       iris_get_cs_dispatch_info(devinfo, shader, grid->block);
8940 
8941    trace_intel_begin_compute(&batch->trace);
8942 
8943    if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
8944       iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
8945          cfe.MaximumNumberofThreads =
8946             devinfo->max_cs_threads * devinfo->subslice_total;
8947          uint32_t scratch_addr = pin_scratch_space(ice, batch, shader,
8948                                                    MESA_SHADER_COMPUTE);
8949          cfe.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
8950       }
8951    }
8952 
8953    struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
8954    idd.KernelStartPointer = KSP(shader);
8955    idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
8956    idd.SharedLocalMemorySize =
8957       intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
8958    idd.PreferredSLMAllocationSize =
8959       intel_compute_preferred_slm_calc_encode_size(devinfo,
8960                                                    shader->total_shared,
8961                                                    dispatch.group_size,
8962                                                    dispatch.simd_size);
8963    idd.SamplerStatePointer = shs->sampler_table.offset;
8964    idd.SamplerCount = encode_sampler_count(shader);
8965    idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
8966    /* Typically set to 0 to avoid prefetching on every thread dispatch. */
8967    idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
8968       0 : MIN2(shader->bt.size_bytes / 4, 31);
8969    idd.NumberOfBarriers = cs_data->uses_barrier;
8970 
8971    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
8972 
8973    if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) {
8974       iris_emit_execute_indirect_dispatch(ice, batch, grid, idd);
8975    } else {
8976       if (grid->indirect)
8977          iris_load_indirect_location(ice, batch, grid);
8978 
8979       iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
8980 
8981       ice->utrace.last_compute_walker =
8982          iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
8983       _iris_pack_command(batch, GENX(COMPUTE_WALKER),
8984                          ice->utrace.last_compute_walker, cw) {
8985          cw.IndirectParameterEnable        = grid->indirect;
8986          cw.SIMDSize                       = dispatch.simd_size / 16;
8987          cw.MessageSIMD                    = dispatch.simd_size / 16;
8988          cw.LocalXMaximum                  = grid->block[0] - 1;
8989          cw.LocalYMaximum                  = grid->block[1] - 1;
8990          cw.LocalZMaximum                  = grid->block[2] - 1;
8991          cw.ThreadGroupIDXDimension        = grid->grid[0];
8992          cw.ThreadGroupIDYDimension        = grid->grid[1];
8993          cw.ThreadGroupIDZDimension        = grid->grid[2];
8994          cw.ExecutionMask                  = dispatch.right_mask;
8995          cw.PostSync.MOCS                  = iris_mocs(NULL, &screen->isl_dev, 0);
8996          cw.InterfaceDescriptor            = idd;
8997 
8998 #if GFX_VERx10 >= 125
8999          cw.GenerateLocalID = cs_data->generate_local_id != 0;
9000          cw.EmitLocal       = cs_data->generate_local_id;
9001          cw.WalkOrder       = cs_data->walk_order;
9002          cw.TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9003                          TileY32bpe : Linear;
9004 #endif
9005 
9006          assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0);
9007       }
9008    }
9009 
9010    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2]);
9011 }
9012 
9013 #else /* #if GFX_VERx10 >= 125 */
9014 
9015 static void
9016 iris_upload_gpgpu_walker(struct iris_context *ice,
9017                          struct iris_batch *batch,
9018                          const struct pipe_grid_info *grid)
9019 {
9020    const uint64_t stage_dirty = ice->state.stage_dirty;
9021    struct iris_screen *screen = batch->screen;
9022    const struct intel_device_info *devinfo = screen->devinfo;
9023    struct iris_binder *binder = &ice->state.binder;
9024    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9025    struct iris_uncompiled_shader *ish =
9026       ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
9027    struct iris_compiled_shader *shader =
9028       ice->shaders.prog[MESA_SHADER_COMPUTE];
9029    struct iris_cs_data *cs_data = iris_cs_data(shader);
9030    const struct intel_cs_dispatch_info dispatch =
9031       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9032 
9033    trace_intel_begin_compute(&batch->trace);
9034 
9035    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9036        cs_data->local_size[0] == 0 /* Variable local group size */) {
9037       /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
9038        *
9039        *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
9040        *    the only bits that are changed are scoreboard related: Scoreboard
9041        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
9042        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
9043        *    sufficient."
9044        */
9045       iris_emit_pipe_control_flush(batch,
9046                                    "workaround: stall before MEDIA_VFE_STATE",
9047                                    PIPE_CONTROL_CS_STALL);
9048 
9049       iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
9050          if (shader->total_scratch) {
9051             uint32_t scratch_addr =
9052                pin_scratch_space(ice, batch, shader, MESA_SHADER_COMPUTE);
9053 
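            /* Assuming total_scratch is a power of two >= 1KB (the compiler
             * rounds it up), ffs() - 11 encodes log2(per-thread bytes / 1KB).
             */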
9054             vfe.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
9055             vfe.ScratchSpaceBasePointer =
9056                rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
9057          }
9058 
9059          vfe.MaximumNumberofThreads =
9060             devinfo->max_cs_threads * devinfo->subslice_total - 1;
9061 #if GFX_VER < 11
9062          vfe.ResetGatewayTimer =
9063             Resettingrelativetimerandlatchingtheglobaltimestamp;
9064 #endif
9065 #if GFX_VER == 8
9066          vfe.BypassGatewayControl = true;
9067 #endif
9068          vfe.NumberofURBEntries = 2;
9069          vfe.URBEntryAllocationSize = 2;
9070 
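         /* The CURBE allocation appears to be counted in 32-byte register
          * units: one copy of the per-thread push constants for every thread
          * plus the shared cross-thread constants, rounded up to an even
          * register count.
          */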
9071          vfe.CURBEAllocationSize =
9072             ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
9073                   cs_data->push.cross_thread.regs, 2);
9074       }
9075    }
9076 
9077    /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
9078    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9079        cs_data->local_size[0] == 0 /* Variable local group size */) {
9080       uint32_t curbe_data_offset = 0;
9081       assert(cs_data->push.cross_thread.dwords == 0 &&
9082              cs_data->push.per_thread.dwords == 1 &&
9083              cs_data->first_param_is_builtin_subgroup_id);
9084       const unsigned push_const_size =
9085          iris_cs_push_const_total_size(shader, dispatch.threads);
9086       uint32_t *curbe_data_map =
9087          stream_state(batch, ice->state.dynamic_uploader,
9088                       &ice->state.last_res.cs_thread_ids,
9089                       ALIGN(push_const_size, 64), 64,
9090                       &curbe_data_offset);
9091       assert(curbe_data_map);
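      /* Poison the buffer first (0x5a), presumably so any bytes the fill
       * below leaves untouched are easy to spot while debugging.
       */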
9092       memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
9093       iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
9094                                      curbe_data_map);
9095 
9096       iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
9097          curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
9098          curbe.CURBEDataStartAddress = curbe_data_offset;
9099       }
9100    }
9101 
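   /* Pin any buffers bound as globals.  The list is assumed to be packed
    * from slot 0, so the first empty slot terminates it.
    */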
9102    for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
9103       struct pipe_resource *res = ice->state.global_bindings[i];
9104       if (!res)
9105          break;
9106 
9107       iris_use_pinned_bo(batch, iris_resource_bo(res),
9108                          true, IRIS_DOMAIN_NONE);
9109    }
9110 
9111    if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
9112                       IRIS_STAGE_DIRTY_BINDINGS_CS |
9113                       IRIS_STAGE_DIRTY_CONSTANTS_CS |
9114                       IRIS_STAGE_DIRTY_CS)) {
9115       uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
9116 
9117       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
9118          idd.SharedLocalMemorySize =
9119             intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
9120          idd.KernelStartPointer =
9121             KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
9122          idd.SamplerStatePointer = shs->sampler_table.offset;
9123          idd.BindingTablePointer =
9124             binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
9125          idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9126       }
9127 
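      /* Merge in the shader's precomputed INTERFACE_DESCRIPTOR_DATA DWords
       * (stored as derived data at compile time) with the fields packed
       * dynamically above.
       */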
9128       for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
9129          desc[i] |= ((uint32_t *) shader->derived_data)[i];
9130 
9131       iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
9132          load.InterfaceDescriptorTotalLength =
9133             GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
9134          load.InterfaceDescriptorDataStartAddress =
9135             emit_state(batch, ice->state.dynamic_uploader,
9136                        &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
9137       }
9138    }
9139 
9140    if (grid->indirect)
9141       iris_load_indirect_location(ice, batch, grid);
9142 
9143    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9144 
9145    iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
9146       ggw.IndirectParameterEnable    = grid->indirect != NULL;
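      /* SIMDSize encodes the dispatch width as simd_size / 16:
       * SIMD8 -> 0, SIMD16 -> 1, SIMD32 -> 2.
       */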
9147       ggw.SIMDSize                   = dispatch.simd_size / 16;
9148       ggw.ThreadDepthCounterMaximum  = 0;
9149       ggw.ThreadHeightCounterMaximum = 0;
9150       ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
9151       ggw.ThreadGroupIDXDimension    = grid->grid[0];
9152       ggw.ThreadGroupIDYDimension    = grid->grid[1];
9153       ggw.ThreadGroupIDZDimension    = grid->grid[2];
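      /* right_mask disables the unused lanes of the last, possibly partial,
       * thread in each group; the bottom mask stays fully enabled.
       */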
9154       ggw.RightExecutionMask         = dispatch.right_mask;
9155       ggw.BottomExecutionMask        = 0xffffffff;
9156    }
9157 
9158    iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
9159 
9160    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2]);
9161 }
9162 
9163 #endif /* #if GFX_VERx10 >= 125 */
9164 
9165 static void
9166 iris_upload_compute_state(struct iris_context *ice,
9167                           struct iris_batch *batch,
9168                           const struct pipe_grid_info *grid)
9169 {
9170    struct iris_screen *screen = batch->screen;
9171    const uint64_t stage_dirty = ice->state.stage_dirty;
9172    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9173    struct iris_compiled_shader *shader =
9174       ice->shaders.prog[MESA_SHADER_COMPUTE];
9175    struct iris_border_color_pool *border_color_pool =
9176       iris_bufmgr_get_border_color_pool(screen->bufmgr);
9177 
9178    iris_batch_sync_region_start(batch);
9179 
9180    /* Always pin the binder.  If we're emitting new binding table pointers,
9181     * we need it.  If not, we're probably inheriting old tables via the
9182     * context, and need it anyway.  Since true zero-bindings cases are
9183     * practically non-existent, just pin it and avoid last_res tracking.
9184     */
9185    iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
9186 
9187    if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
9188         shs->sysvals_need_upload) ||
9189        shader->kernel_input_size > 0)
9190       upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
9191 
9192    if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
9193       iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
9194 
9195    if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
9196       iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
9197 
9198    iris_use_optional_res(batch, shs->sampler_table.res, false,
9199                          IRIS_DOMAIN_NONE);
9200    iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
9201                       IRIS_DOMAIN_NONE);
9202 
9203    if (ice->state.need_border_colors)
9204       iris_use_pinned_bo(batch, border_color_pool->bo, false,
9205                          IRIS_DOMAIN_NONE);
9206 
9207 #if GFX_VER >= 12
9208    genX(invalidate_aux_map_state)(batch);
9209 #endif
9210 
9211 #if GFX_VERx10 >= 125
9212    iris_upload_compute_walker(ice, batch, grid);
9213 #else
9214    iris_upload_gpgpu_walker(ice, batch, grid);
9215 #endif
9216 
9217    if (!batch->contains_draw_with_next_seqno) {
9218       iris_restore_compute_saved_bos(ice, batch, grid);
9219       batch->contains_draw_with_next_seqno = batch->contains_draw = true;
9220    }
9221 
9222    iris_batch_sync_region_end(batch);
9223 }
9224 
9225 /**
9226  * State module teardown.
9227  */
9228 static void
9229 iris_destroy_state(struct iris_context *ice)
9230 {
9231    struct iris_genx_state *genx = ice->state.genx;
9232 
9233    pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
9234 
9235    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
9236    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
9237    pipe_resource_reference(&ice->draw.generation.params.res, NULL);
9238    pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
9239 
9240    /* Loop over all VBOs, including ones for draw parameters */
9241    for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
9242       pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
9243    }
9244 
9245    free(ice->state.genx);
9246 
9247    for (int i = 0; i < 4; i++) {
9248       pipe_so_target_reference(&ice->state.so_target[i], NULL);
9249    }
9250 
9251    util_unreference_framebuffer_state(&ice->state.framebuffer);
9252 
9253    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
9254       struct iris_shader_state *shs = &ice->state.shaders[stage];
9255       pipe_resource_reference(&shs->sampler_table.res, NULL);
9256       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
9257          pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
9258          pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
9259       }
9260       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
9261          pipe_resource_reference(&shs->image[i].base.resource, NULL);
9262          pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
9263          free(shs->image[i].surface_state.cpu);
9264       }
9265       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
9266          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
9267          pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
9268       }
9269       for (int i = 0; i < IRIS_MAX_TEXTURES; i++) {
9270          pipe_sampler_view_reference((struct pipe_sampler_view **)
9271                                      &shs->textures[i], NULL);
9272       }
9273    }
9274 
9275    pipe_resource_reference(&ice->state.grid_size.res, NULL);
9276    pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
9277 
9278    pipe_resource_reference(&ice->state.null_fb.res, NULL);
9279    pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
9280 
9281    pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
9282    pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
9283    pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
9284    pipe_resource_reference(&ice->state.last_res.scissor, NULL);
9285    pipe_resource_reference(&ice->state.last_res.blend, NULL);
9286    pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
9287    pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
9288    pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
9289 }
9290 
9291 /* ------------------------------------------------------------------- */
9292 
9293 static void
9294 iris_rebind_buffer(struct iris_context *ice,
9295                    struct iris_resource *res)
9296 {
9297    struct pipe_context *ctx = &ice->ctx;
9298    struct iris_genx_state *genx = ice->state.genx;
9299 
9300    assert(res->base.b.target == PIPE_BUFFER);
9301 
9302    /* Buffers can't be framebuffer attachments, nor display related,
9303     * and we don't have upstream Clover support.
9304     */
9305    assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
9306                                  PIPE_BIND_RENDER_TARGET |
9307                                  PIPE_BIND_BLENDABLE |
9308                                  PIPE_BIND_DISPLAY_TARGET |
9309                                  PIPE_BIND_CURSOR |
9310                                  PIPE_BIND_COMPUTE_RESOURCE |
9311                                  PIPE_BIND_GLOBAL)));
9312 
9313    if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
9314       uint64_t bound_vbs = ice->state.bound_vertex_buffers;
9315       while (bound_vbs) {
9316          const int i = u_bit_scan64(&bound_vbs);
9317          struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
9318 
9319          /* Update the CPU struct */
9320          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
9321          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
9322          uint64_t *addr = (uint64_t *) &state->state[1];
9323          struct iris_bo *bo = iris_resource_bo(state->resource);
9324 
9325          if (*addr != bo->address + state->offset) {
9326             *addr = bo->address + state->offset;
9327             ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
9328                                 IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
9329          }
9330       }
9331    }
9332 
9333    /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
9334     * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
9335     *
9336     * There is also no need to handle these:
9337     * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
9338     * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
9339     */
9340 
9341    if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
9342       uint32_t *so_buffers = genx->so_buffers;
9343       for (unsigned i = 0; i < 4; i++,
9344            so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
9345 
9346          /* There are no other fields in bits 127:64 */
9347          uint64_t *addr = (uint64_t *) &so_buffers[2];
9348          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
9349          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
9350 
9351          struct pipe_stream_output_target *tgt = ice->state.so_target[i];
9352          if (tgt) {
9353             struct iris_bo *bo = iris_resource_bo(tgt->buffer);
9354             if (*addr != bo->address + tgt->buffer_offset) {
9355                *addr = bo->address + tgt->buffer_offset;
9356                ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
9357             }
9358          }
9359       }
9360    }
9361 
9362    for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
9363       struct iris_shader_state *shs = &ice->state.shaders[s];
9364       enum pipe_shader_type p_stage = stage_to_pipe(s);
9365 
9366       if (!(res->bind_stages & (1 << s)))
9367          continue;
9368 
9369       if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
9370          /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
9371          uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
9372          while (bound_cbufs) {
9373             const int i = u_bit_scan(&bound_cbufs);
9374             struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
9375             struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
9376 
9377             if (res->bo == iris_resource_bo(cbuf->buffer)) {
9378                pipe_resource_reference(&surf_state->res, NULL);
9379                shs->dirty_cbufs |= 1u << i;
9380                ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
9381                                     IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
9382                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
9383             }
9384          }
9385       }
9386 
9387       if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
9388          uint32_t bound_ssbos = shs->bound_ssbos;
9389          while (bound_ssbos) {
9390             const int i = u_bit_scan(&bound_ssbos);
9391             struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
9392 
9393             if (res->bo == iris_resource_bo(ssbo->buffer)) {
9394                struct pipe_shader_buffer buf = {
9395                   .buffer = &res->base.b,
9396                   .buffer_offset = ssbo->buffer_offset,
9397                   .buffer_size = ssbo->buffer_size,
9398                };
9399                iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
9400                                        (shs->writable_ssbos >> i) & 1);
9401             }
9402          }
9403       }
9404 
9405       if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
9406          int i;
9407          BITSET_FOREACH_SET(i, shs->bound_sampler_views, IRIS_MAX_TEXTURES) {
9408             struct iris_sampler_view *isv = shs->textures[i];
9409             struct iris_bo *bo = isv->res->bo;
9410 
9411             if (update_surface_state_addrs(ice->state.surface_uploader,
9412                                            &isv->surface_state, bo)) {
9413                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9414             }
9415          }
9416       }
9417 
9418       if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
9419          uint64_t bound_image_views = shs->bound_image_views;
9420          while (bound_image_views) {
9421             const int i = u_bit_scan64(&bound_image_views);
9422             struct iris_image_view *iv = &shs->image[i];
9423             struct iris_bo *bo = iris_resource_bo(iv->base.resource);
9424 
9425             if (update_surface_state_addrs(ice->state.surface_uploader,
9426                                            &iv->surface_state, bo)) {
9427                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9428             }
9429          }
9430       }
9431    }
9432 }
9433 
9434 /* ------------------------------------------------------------------- */
9435 
9436 /**
9437  * Introduce a batch synchronization boundary, and update its cache coherency
9438  * status to reflect the execution of a PIPE_CONTROL command with the
9439  * specified flags.
9440  */
9441 static void
9442 batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
9443 {
9444    const struct intel_device_info *devinfo = batch->screen->devinfo;
9445 
9446    iris_batch_sync_boundary(batch);
9447 
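   /* Flush-side tracking below only applies when the PIPE_CONTROL also
    * stalls the command streamer; invalidation-side tracking further down
    * is recorded regardless of the stall bit.
    */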
9448    if ((flags & PIPE_CONTROL_CS_STALL)) {
9449       if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9450          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9451 
9452       if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9453          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9454 
9455       if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
9456          /* A tile cache flush makes any C/Z data in L3 visible to memory. */
9457          const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
9458          const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
9459          batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
9460          batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
9461       }
9462 
9463       if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9464          /* HDC and DC flushes both flush the data cache out to L3 */
9465          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9466       }
9467 
9468       if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9469          /* A DC flush also flushes L3 data cache lines out to memory. */
9470          const unsigned i = IRIS_DOMAIN_DATA_WRITE;
9471          batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
9472       }
9473 
9474       if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9475          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9476 
9477       if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
9478                     PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
9479          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
9480          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9481          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9482          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
9483       }
9484    }
9485 
9486    if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9487       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9488 
9489    if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9490       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9491 
9492    if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH))
9493       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9494 
9495    if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9496       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9497 
9498    if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
9499       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
9500 
9501    if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE))
9502       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9503 
9504    /* Technically, to invalidate IRIS_DOMAIN_PULL_CONSTANT_READ, we need
9505     * both "Constant Cache Invalidate" and either "Texture Cache Invalidate"
9506     * or "Data Cache Flush" set, depending on the setting of
9507     * iris_indirect_ubos_use_sampler().
9508     *
9509     * However, "Data Cache Flush" and "Constant Cache Invalidate" will never
9510     * appear in the same PIPE_CONTROL command, because one is bottom-of-pipe
9511     * while the other is top-of-pipe.  Because we only look at one flush at
9512     * a time, we won't see both together.
9513     *
9514     * To deal with this, we mark it as invalidated when the constant cache
9515     * is invalidated, and trust the callers to also flush the other related
9516     * cache correctly at the same time.
9517     */
9518    if ((flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
9519       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9520 
9521    /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
9522 
9523    if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
9524       /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
9525        * domains will now be visible to those L3 clients.
9526        */
9527       for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
9528          if (!iris_domain_is_l3_coherent(devinfo, i))
9529             batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
9530       }
9531    }
9532 }
9533 
9534 static unsigned
9535 flags_to_post_sync_op(uint32_t flags)
9536 {
9537    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
9538       return WriteImmediateData;
9539 
9540    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
9541       return WritePSDepthCount;
9542 
9543    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
9544       return WriteTimestamp;
9545 
9546    return 0;
9547 }
9548 
9549 /**
9550  * Do the given flags have a Post Sync or LRI Post Sync operation?
9551  */
9552 static enum pipe_control_flags
9553 get_post_sync_flags(enum pipe_control_flags flags)
9554 {
9555    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
9556             PIPE_CONTROL_WRITE_DEPTH_COUNT |
9557             PIPE_CONTROL_WRITE_TIMESTAMP |
9558             PIPE_CONTROL_LRI_POST_SYNC_OP;
9559 
9560    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
9561     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
9562     */
9563    assert(util_bitcount(flags) <= 1);
9564 
9565    return flags;
9566 }
9567 
9568 #define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)
9569 
9570 /**
9571  * Emit a series of PIPE_CONTROL commands, taking into account any
9572  * workarounds necessary to actually accomplish the caller's request.
9573  *
9574  * Unless otherwise noted, spec quotations in this function come from:
9575  *
9576  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
9577  * Restrictions for PIPE_CONTROL.
9578  *
9579  * You should not use this function directly.  Use the helpers in
9580  * iris_pipe_control.c instead, which may split the pipe control further.
9581  */
9582 static void
9583 iris_emit_raw_pipe_control(struct iris_batch *batch,
9584                            const char *reason,
9585                            uint32_t flags,
9586                            struct iris_bo *bo,
9587                            uint32_t offset,
9588                            uint64_t imm)
9589 {
9590    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
9591    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
9592    enum pipe_control_flags non_lri_post_sync_flags =
9593       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
9594 
9595 #if GFX_VER >= 12
9596    if (batch->name == IRIS_BATCH_BLITTER) {
9597       batch_mark_sync_for_pipe_control(batch, flags);
9598       iris_batch_sync_region_start(batch);
9599 
9600       assert(!(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT));
9601 
9602       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
9603       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
9604          batch_emit_fast_color_dummy_blit(batch);
9605 
9606       /* The blitter doesn't actually use PIPE_CONTROL; rather it uses the
9607        * MI_FLUSH_DW command.  However, all of our code is set up to flush
9608        * via emitting a pipe control, so we just translate it at this point,
9609        * even if it is a bit hacky.
9610        */
9611       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
9612          fd.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
9613          fd.ImmediateData = imm;
9614          fd.PostSyncOperation = flags_to_post_sync_op(flags);
9615 #if GFX_VERx10 >= 125
9616          /* TODO: This may not always be necessary */
9617          fd.FlushCCS = true;
9618 #endif
9619       }
9620       iris_batch_sync_region_end(batch);
9621       return;
9622    }
9623 #endif
9624 
9625    /* The "L3 Read Only Cache Invalidation Bit" docs say it "controls the
9626     * invalidation of the Geometry streams cached in L3 cache at the top
9627     * of the pipe".  In other words, index & vertex data that gets cached
9628     * in L3 when VERTEX_BUFFER_STATE::L3BypassDisable is set.
9629     *
9630     * Normally, invalidating L1/L2 read-only caches also invalidates their
9631     * related L3 cachelines, but this isn't the case for the VF cache.
9632     * Emulate it by setting the L3 Read Only bit when doing a VF invalidate.
9633     */
9634    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)
9635       flags |= PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
9636 
9637    /* Recursive PIPE_CONTROL workarounds --------------------------------
9638     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
9639     *
9640     * We do these first because we want to look at the original operation,
9641     * rather than any workarounds we set.
9642     */
9643    if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
9644       /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
9645        * lists several workarounds:
9646        *
9647        *    "Project: SKL, KBL, BXT
9648        *
9649        *     If the VF Cache Invalidation Enable is set to a 1 in a
9650        *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
9651        *     sets to 0, with the VF Cache Invalidation Enable set to 0
9652        *     needs to be sent prior to the PIPE_CONTROL with VF Cache
9653        *     Invalidation Enable set to a 1."
9654        */
9655       iris_emit_raw_pipe_control(batch,
9656                                  "workaround: recursive VF cache invalidate",
9657                                  0, NULL, 0, 0);
9658    }
9659 
9660    if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
9661       /* Project: SKL / Argument: LRI Post Sync Operation [23]
9662        *
9663        * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9664        *  programmed prior to programming a PIPECONTROL command with "LRI
9665        *  Post Sync Operation" in GPGPU mode of operation (i.e when
9666        *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
9667        *
9668        * The same text exists a few rows below for Post Sync Op.
9669        */
9670       iris_emit_raw_pipe_control(batch,
9671                                  "workaround: CS stall before gpgpu post-sync",
9672                                  PIPE_CONTROL_CS_STALL, bo, offset, imm);
9673    }
9674 
9675    /* "Flush Types" workarounds ---------------------------------------------
9676     * We do these now because they may add post-sync operations or CS stalls.
9677     */
9678 
9679    if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
9680       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
9681        *
9682        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
9683        *  'Write PS Depth Count' or 'Write Timestamp'."
9684        */
9685       if (!bo) {
9686          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9687          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9688          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9689          bo = batch->screen->workaround_address.bo;
9690          offset = batch->screen->workaround_address.offset;
9691       }
9692    }
9693 
9694    if (flags & PIPE_CONTROL_DEPTH_STALL) {
9695       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
9696        *
9697        *    "This bit must be DISABLED for operations other than writing
9698        *     PS_DEPTH_COUNT."
9699        *
9700        * This seems like nonsense.  An Ivybridge workaround requires us to
9701        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
9702        * operation.  Gfx8+ requires us to emit depth stalls and depth cache
9703        * flushes together.  So, it's hard to imagine this means anything other
9704        * than "we originally intended this to be used for PS_DEPTH_COUNT".
9705        *
9706        * We ignore the supposed restriction and do nothing.
9707        */
9708    }
9709 
9710    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
9711                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9712       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
9713        *
9714        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
9715        *     PS_DEPTH_COUNT or TIMESTAMP queries."
9716        *
9717        * TODO: Implement end-of-pipe checking.
9718        */
9719       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
9720                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
9721    }
9722 
9723    if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9724       /* From the PIPE_CONTROL instruction table, bit 1:
9725        *
9726        *    "This bit is ignored if Depth Stall Enable is set.
9727        *     Further, the render cache is not flushed even if Write Cache
9728        *     Flush Enable bit is set."
9729        *
9730        * We assert that the caller doesn't do this combination, to try and
9731        * prevent mistakes.  It shouldn't hurt the GPU, though.
9732        *
9733        * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
9734        * and "Render Target Flush" combo is explicitly required for BTI
9735        * update workarounds.
9736        */
9737       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
9738                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
9739    }
9740 
9741    /* PIPE_CONTROL page workarounds ------------------------------------- */
9742 
9743    if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
9744       /* From the PIPE_CONTROL page itself:
9745        *
9746        *    "IVB, HSW, BDW
9747        *     Restriction: Pipe_control with CS-stall bit set must be issued
9748        *     before a pipe-control command that has the State Cache
9749        *     Invalidate bit set."
9750        */
9751       flags |= PIPE_CONTROL_CS_STALL;
9752    }
9753 
9754    if (flags & PIPE_CONTROL_FLUSH_LLC) {
9755       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
9756        *
9757        *    "Project: ALL
9758        *     SW must always program Post-Sync Operation to "Write Immediate
9759        *     Data" when Flush LLC is set."
9760        *
9761        * For now, we just require the caller to do it.
9762        */
9763       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
9764    }
9765 
9766    /* Emulate a HDC flush with a full Data Cache Flush on older hardware which
9767     * doesn't support the new lightweight flush.
9768     */
9769 #if GFX_VER < 12
9770       if (flags & PIPE_CONTROL_FLUSH_HDC)
9771          flags |= PIPE_CONTROL_DATA_CACHE_FLUSH;
9772 #endif
9773 
9774    /* "Post-Sync Operation" workarounds -------------------------------- */
9775 
9776    /* Project: All / Argument: Global Snapshot Count Reset [19]
9777     *
9778     * "This bit must not be exercised on any product.
9779     *  Requires stall bit ([20] of DW1) set."
9780     *
9781     * We don't use this, so we just assert that it isn't used.  The
9782     * PIPE_CONTROL instruction page indicates that they intended this
9783     * as a debug feature and don't think it is useful in production,
9784     * but it may actually be usable, should we ever want to.
9785     */
9786    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
9787 
9788    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
9789                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
9790       /* Project: All / Arguments:
9791        *
9792        * - Generic Media State Clear [16]
9793        * - Indirect State Pointers Disable [16]
9794        *
9795        *    "Requires stall bit ([20] of DW1) set."
9796        *
9797        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
9798        * State Clear) says:
9799        *
9800        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
9801        *     programmed prior to programming a PIPECONTROL command with "Media
9802        *     State Clear" set in GPGPU mode of operation"
9803        *
9804        * This is a subset of the earlier rule, so there's nothing to do.
9805        */
9806       flags |= PIPE_CONTROL_CS_STALL;
9807    }
9808 
9809    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
9810       /* Project: All / Argument: Store Data Index
9811        *
9812        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9813        *  than '0'."
9814        *
9815        * For now, we just assert that the caller does this.  We might want to
9816        * automatically add a write to the workaround BO...
9817        */
9818       assert(non_lri_post_sync_flags != 0);
9819    }
9820 
9821    if (flags & PIPE_CONTROL_SYNC_GFDT) {
9822       /* Project: All / Argument: Sync GFDT
9823        *
9824        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9825        *  than '0' or 0x2520[13] must be set."
9826        *
9827        * For now, we just assert that the caller does this.
9828        */
9829       assert(non_lri_post_sync_flags != 0);
9830    }
9831 
9832    if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
9833       /* Project: IVB+ / Argument: TLB inv
9834        *
9835        *    "Requires stall bit ([20] of DW1) set."
9836        *
9837        * Also, from the PIPE_CONTROL instruction table:
9838        *
9839        *    "Project: SKL+
9840        *     Post Sync Operation or CS stall must be set to ensure a TLB
9841        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
9842        *     cache to invalidate."
9843        *
9844        * This is not a subset of the earlier rule, so there's nothing to do.
9845        */
9846       flags |= PIPE_CONTROL_CS_STALL;
9847    }
9848 
9849    if (GFX_VER == 9 && devinfo->gt == 4) {
9850       /* TODO: The big Skylake GT4 post sync op workaround */
9851    }
9852 
9853    /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
9854 
9855    if (IS_COMPUTE_PIPELINE(batch)) {
9856       if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
9857          /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
9858           * PIPE_CONTROL, Flush Types:
9859           *   "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
9860           * For newer platforms this is documented in the PIPE_CONTROL
9861           * instruction page.
9862           */
9863          flags |= PIPE_CONTROL_CS_STALL;
9864       }
9865 
9866       if (GFX_VER == 8 && (post_sync_flags ||
9867                            (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
9868                                      PIPE_CONTROL_DEPTH_STALL |
9869                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
9870                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9871                                      PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
9872          /* Project: BDW / Arguments:
9873           *
9874           * - LRI Post Sync Operation   [23]
9875           * - Post Sync Op              [15:14]
9876           * - Notify En                 [8]
9877           * - Depth Stall               [13]
9878           * - Render Target Cache Flush [12]
9879           * - Depth Cache Flush         [0]
9880           * - DC Flush Enable           [5]
9881           *
9882           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
9883           *     Workloads."
9884           */
9885          flags |= PIPE_CONTROL_CS_STALL;
9886 
9887          /* Also, from the PIPE_CONTROL instruction table, bit 20:
9888           *
9889           *    "Project: BDW
9890           *     This bit must be always set when PIPE_CONTROL command is
9891           *     programmed by GPGPU and MEDIA workloads, except for the cases
9892           *     when only Read Only Cache Invalidation bits are set (State
9893           *     Cache Invalidation Enable, Instruction cache Invalidation
9894           *     Enable, Texture Cache Invalidation Enable, Constant Cache
9895           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
9896           *     need not implemented when FF_DOP_CG is disable via "Fixed
9897           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
9898           *
9899           * It sounds like we could avoid CS stalls in some cases, but we
9900           * don't currently bother.  This list isn't exactly the list above,
9901           * either...
9902           */
9903       }
9904    }
9905 
9906    /* "Stall" workarounds ----------------------------------------------
9907     * These have to come after the earlier ones because we may have added
9908     * some additional CS stalls above.
9909     */
9910 
9911    if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
9912       /* Project: PRE-SKL, VLV, CHV
9913        *
9914        * "[All Stepping][All SKUs]:
9915        *
9916        *  One of the following must also be set:
9917        *
9918        *  - Render Target Cache Flush Enable ([12] of DW1)
9919        *  - Depth Cache Flush Enable ([0] of DW1)
9920        *  - Stall at Pixel Scoreboard ([1] of DW1)
9921        *  - Depth Stall ([13] of DW1)
9922        *  - Post-Sync Operation ([13] of DW1)
9923        *  - DC Flush Enable ([5] of DW1)"
9924        *
9925        * If we don't already have one of those bits set, we choose to add
9926        * "Stall at Pixel Scoreboard".  Some of the other bits require a
9927        * CS stall as a workaround (see above), which would send us into
9928        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
9929        * appears to be safe, so we choose that.
9930        */
9931       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
9932                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9933                                PIPE_CONTROL_WRITE_IMMEDIATE |
9934                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
9935                                PIPE_CONTROL_WRITE_TIMESTAMP |
9936                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
9937                                PIPE_CONTROL_DEPTH_STALL |
9938                                PIPE_CONTROL_DATA_CACHE_FLUSH;
9939       if (!(flags & wa_bits))
9940          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
9941    }
9942 
9943    if (INTEL_NEEDS_WA_1409600907 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
9944       /* Wa_1409600907:
9945        *
9946        * "PIPE_CONTROL with Depth Stall Enable bit must be set
9947        * with any PIPE_CONTROL with Depth Flush Enable bit set.
9948        */
9949       flags |= PIPE_CONTROL_DEPTH_STALL;
9950    }
9951 
9952    /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
9953     * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
9954     * with CS_STALL Bit set (with No POST_SYNC ENABLED)
9955     */
9956    if (intel_device_info_is_adln(devinfo) &&
9957        IS_COMPUTE_PIPELINE(batch) &&
9958        flags_to_post_sync_op(flags) != NoWrite) {
9959       iris_emit_raw_pipe_control(batch, "Wa_14014966230",
9960                                  PIPE_CONTROL_CS_STALL, NULL, 0, 0);
9961    }
9962 
9963    batch_mark_sync_for_pipe_control(batch, flags);
9964 
9965 #if INTEL_NEEDS_WA_14010840176
9966    /* "If the intention of “constant cache invalidate” is
9967     *  to invalidate the L1 cache (which can cache constants), use “HDC
9968     *  pipeline flush” instead of Constant Cache invalidate command."
9969     *
9970     * "If L3 invalidate is needed, the w/a should be to set state invalidate
9971     * in the pipe control command, in addition to the HDC pipeline flush."
9972     */
9973    if (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) {
9974       flags &= ~PIPE_CONTROL_CONST_CACHE_INVALIDATE;
9975       flags |= PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_STATE_CACHE_INVALIDATE;
9976    }
9977 #endif
9978 
9979    /* Emit --------------------------------------------------------------- */
9980 
9981    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
9982       fprintf(stderr,
9983               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
9984               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
9985               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
9986               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
9987               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
9988               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
9989               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
9990               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
9991               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
9992               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
9993               (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
9994               (flags & PIPE_CONTROL_CCS_CACHE_FLUSH) ? "CCS " : "",
9995               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
9996               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
9997               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
9998               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
9999               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
10000               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
10001               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
10002                  "SnapRes" : "",
10003               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
10004                   "ISPDis" : "",
10005               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
10006               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
10007               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
10008               (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
10009               (flags & PIPE_CONTROL_PSS_STALL_SYNC) ? "PSS " : "",
10010               (flags & PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH) ? "UntypedDataPortCache " : "",
10011               imm, reason);
10012    }
10013 
10014    iris_batch_sync_region_start(batch);
10015 
10016    const bool trace_pc =
10017       (flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CACHE_INVALIDATE_BITS)) != 0;
10018 
10019    if (trace_pc)
10020       trace_intel_begin_stall(&batch->trace);
10021 
10022    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
10023 #if GFX_VERx10 >= 125
10024       pc.PSSStallSyncEnable = flags & PIPE_CONTROL_PSS_STALL_SYNC;
10025 #endif
10026 #if GFX_VER == 12
10027       pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
10028 #endif
10029 #if GFX_VER > 11
10030       pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
10031 #endif
10032 #if GFX_VERx10 >= 125
10033       pc.UntypedDataPortCacheFlushEnable =
10034          (flags & (PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
10035                    PIPE_CONTROL_FLUSH_HDC |
10036                    PIPE_CONTROL_DATA_CACHE_FLUSH)) &&
10037          IS_COMPUTE_PIPELINE(batch);
10038       pc.HDCPipelineFlushEnable |= pc.UntypedDataPortCacheFlushEnable;
10039       pc.CCSFlushEnable |= flags & PIPE_CONTROL_CCS_CACHE_FLUSH;
10040 #endif
10041       pc.LRIPostSyncOperation = NoLRIOperation;
10042       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
10043       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
10044       pc.StoreDataIndex = 0;
10045       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
10046 #if GFX_VERx10 < 125
10047       pc.GlobalSnapshotCountReset =
10048          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
10049 #endif
10050       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
10051 #if GFX_VERx10 < 200
10052       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
10053 #endif
10054       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
10055       pc.RenderTargetCacheFlushEnable =
10056          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
10057       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
10058       pc.StateCacheInvalidationEnable =
10059          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10060 #if GFX_VER >= 12
10061       pc.L3ReadOnlyCacheInvalidationEnable =
10062          flags & PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
10063 #endif
10064       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
10065       pc.ConstantCacheInvalidationEnable =
10066          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10067       pc.PostSyncOperation = flags_to_post_sync_op(flags);
10068       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
10069       pc.InstructionCacheInvalidateEnable =
10070          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
10071       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
10072       pc.IndirectStatePointersDisable =
10073          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
10074       pc.TextureCacheInvalidationEnable =
10075          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
10076       pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
10077       pc.ImmediateData = imm;
10078    }
10079 
10080    if (trace_pc) {
10081       trace_intel_end_stall(&batch->trace, flags,
10082                             iris_utrace_pipe_flush_bit_to_ds_stall_flag,
10083                             reason,0,0,0);
10084                             reason, 0, 0, 0);
10085 
10086    iris_batch_sync_region_end(batch);
10087 }
10088 
10089 #if GFX_VER == 9
10090 /**
10091  * Preemption on Gfx9 has to be enabled or disabled in various cases.
10092  *
10093  * See these workarounds for preemption:
10094  *  - WaDisableMidObjectPreemptionForGSLineStripAdj
10095  *  - WaDisableMidObjectPreemptionForTrifanOrPolygon
10096  *  - WaDisableMidObjectPreemptionForLineLoop
10097  *  - WA#0798
10098  *
10099  * We don't put this in the vtable because it's only used on Gfx9.
10100  */
10101 void
10102 gfx9_toggle_preemption(struct iris_context *ice,
10103                        struct iris_batch *batch,
10104                        const struct pipe_draw_info *draw)
10105 {
10106    struct iris_genx_state *genx = ice->state.genx;
10107    bool object_preemption = true;
10108 
10109    /* WaDisableMidObjectPreemptionForGSLineStripAdj
10110     *
10111     *    "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
10112     *     and GS is enabled."
10113     */
10114    if (draw->mode == MESA_PRIM_LINE_STRIP_ADJACENCY &&
10115        ice->shaders.prog[MESA_SHADER_GEOMETRY])
10116       object_preemption = false;
10117 
10118    /* WaDisableMidObjectPreemptionForTrifanOrPolygon
10119     *
10120     *    "TriFan miscompare in Execlist Preemption test. Cut index that is
10121     *     on a previous context. End the previous, the resume another context
10122     *     on a previous context. End the previous, then resume another context
10123     *     prempt again we will cause corruption.
10124     *     preempt again we will cause corruption.
10125     *     WA: Disable mid-draw preemption when draw-call has a tri-fan."
10126     */
10127    if (draw->mode == MESA_PRIM_TRIANGLE_FAN)
10128       object_preemption = false;
10129 
10130    /* WaDisableMidObjectPreemptionForLineLoop
10131     *
10132     *    "VF Stats Counters Missing a vertex when preemption enabled.
10133     *
10134     *     WA: Disable mid-draw preemption when the draw uses a lineloop
10135     *     topology."
10136     */
10137    if (draw->mode == MESA_PRIM_LINE_LOOP)
10138       object_preemption = false;
10139 
10140    /* WA#0798
10141     *
10142     *    "VF is corrupting GAFS data when preempted on an instance boundary
10143     *     and replayed with instancing enabled.
10144     *
10145     *     WA: Disable preemption when using instancing."
10146     */
10147    if (draw->instance_count > 1)
10148       object_preemption = false;
10149 
10150    if (genx->object_preemption != object_preemption) {
10151       iris_enable_obj_preemption(batch, object_preemption);
10152       genx->object_preemption = object_preemption;
10153    }
10154 }
10155 #endif
10156 
10157 static void
10158 iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
10159 {
10160    struct iris_genx_state *genx = ice->state.genx;
10161 
10162 #if INTEL_NEEDS_WA_1808121037
10163    genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
10164 #endif
10165 
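   /* Forget the cached 3DSTATE_INDEX_BUFFER contents so the next draw
    * re-emits it from scratch.
    */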
10166    memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
10167 }
10168 
10169 static void
10170 iris_emit_mi_report_perf_count(struct iris_batch *batch,
10171                                struct iris_bo *bo,
10172                                uint32_t offset_in_bytes,
10173                                uint32_t report_id)
10174 {
10175    iris_batch_sync_region_start(batch);
10176    iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
10177       mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
10178                                    IRIS_DOMAIN_OTHER_WRITE);
10179       mi_rpc.ReportID = report_id;
10180    }
10181    iris_batch_sync_region_end(batch);
10182 }
10183 
10184 /**
10185  * Update the pixel hashing modes that determine the balancing of PS threads
10186  * across subslices and slices.
10187  *
10188  * \param width Width bound of the rendering area (already scaled down if \p
10189  *              scale is greater than 1).
10190  * \param height Height bound of the rendering area (already scaled down if \p
10191  *               scale is greater than 1).
10192  * \param scale The number of framebuffer samples that could potentially be
10193  *              affected by an individual channel of the PS thread.  This is
10194  *              typically one for single-sampled rendering, but for operations
10195  *              like CCS resolves and fast clears a single PS invocation may
10196  *              update a huge number of pixels, in which case a finer
10197  *              balancing is desirable in order to maximally utilize the
10198  *              bandwidth available.  UINT_MAX can be used as shorthand for
10199  *              "finest hashing mode available".
10200  */
10201 void
10202 genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
10203                         unsigned width, unsigned height, unsigned scale)
10204 {
10205 #if GFX_VER == 9
10206    const struct intel_device_info *devinfo = batch->screen->devinfo;
10207    const unsigned slice_hashing[] = {
10208       /* Because all Gfx9 platforms with more than one slice require
10209        * three-way subslice hashing, a single "normal" 16x16 slice hashing
10210        * block is guaranteed to suffer from substantial imbalance, with one
10211        * subslice receiving twice as much work as the other two in the
10212        * slice.
10213        *
10214        * The performance impact of that would be particularly severe when
10215        * three-way hashing is also in use for slice balancing (which is the
10216        * case for all Gfx9 GT4 platforms), because one of the slices
10217        * receives one every three 16x16 blocks in either direction, which
10218        * is roughly the periodicity of the underlying subslice imbalance
10219        * pattern ("roughly" because in reality the hardware's
10220        * implementation of three-way hashing doesn't do exact modulo 3
10221        * arithmetic, which somewhat decreases the magnitude of this effect
10222        * in practice).  This leads to a systematic subslice imbalance
10223        * within that slice regardless of the size of the primitive.  The
10224        * 32x32 hashing mode guarantees that the subslice imbalance within a
10225        * single slice hashing block is minimal, largely eliminating this
10226        * effect.
10227        */
10228       _32x32,
10229       /* Finest slice hashing mode available. */
10230       NORMAL
10231    };
10232    const unsigned subslice_hashing[] = {
10233       /* 16x16 would provide a slight cache locality benefit especially
10234        * visible in the sampler L1 cache efficiency of low-bandwidth
10235        * non-LLC platforms, but it comes at the cost of greater subslice
10236        * imbalance for primitives of dimensions approximately intermediate
10237        * between 16x4 and 16x16.
10238        */
10239       _16x4,
10240       /* Finest subslice hashing mode available. */
10241       _8x4
10242    };
10243    /* Dimensions of the smallest hashing block of a given hashing mode.  If
10244     * the rendering area is smaller than this there can't possibly be any
10245     * benefit from switching to this mode, so we optimize out the
10246     * transition.
10247     */
10248    const unsigned min_size[][2] = {
10249       { 16, 4 },
10250       { 8, 4 }
10251    };
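   /* Use the finer hashing modes (index 1) whenever a single PS channel can
    * affect more than one sample (scale > 1), per the rationale above.
    */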
10252    const unsigned idx = scale > 1;
10253 
10254    if (width > min_size[idx][0] || height > min_size[idx][1]) {
10255       iris_emit_raw_pipe_control(batch,
10256                                  "workaround: CS stall before GT_MODE LRI",
10257                                  PIPE_CONTROL_STALL_AT_SCOREBOARD |
10258                                  PIPE_CONTROL_CS_STALL,
10259                                  NULL, 0, 0);
10260 
10261       iris_emit_reg(batch, GENX(GT_MODE), reg) {
10262          reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
10263          reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
10264          reg.SubsliceHashing = subslice_hashing[idx];
10265          reg.SubsliceHashingMask = -1;
10266       };
10267 
10268       ice->state.current_hash_scale = scale;
10269    }
10270 #endif
10271 }
10272 
10273 static void
10274 iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
10275 {
10276    struct iris_context *ice = (struct iris_context *) ctx;
10277 
10278    if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
10279       ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
10280       ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
10281    }
10282 
10283    if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
10284       ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
10285       ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
10286    }
10287 }
10288 
10289 void
10290 genX(init_screen_state)(struct iris_screen *screen)
10291 {
10292    assert(screen->devinfo->verx10 == GFX_VERx10);
10293    screen->vtbl.destroy_state = iris_destroy_state;
10294    screen->vtbl.init_render_context = iris_init_render_context;
10295    screen->vtbl.init_compute_context = iris_init_compute_context;
10296    screen->vtbl.init_copy_context = iris_init_copy_context;
10297    screen->vtbl.upload_render_state = iris_upload_render_state;
10298    screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
10299    screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
10300    screen->vtbl.update_binder_address = iris_update_binder_address;
10301    screen->vtbl.upload_compute_state = iris_upload_compute_state;
10302    screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
10303    screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
10304    screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
10305    screen->vtbl.rebind_buffer = iris_rebind_buffer;
10306    screen->vtbl.load_register_reg32 = iris_load_register_reg32;
10307    screen->vtbl.load_register_reg64 = iris_load_register_reg64;
10308    screen->vtbl.load_register_imm32 = iris_load_register_imm32;
10309    screen->vtbl.load_register_imm64 = iris_load_register_imm64;
10310    screen->vtbl.load_register_mem32 = iris_load_register_mem32;
10311    screen->vtbl.load_register_mem64 = iris_load_register_mem64;
10312    screen->vtbl.store_register_mem32 = iris_store_register_mem32;
10313    screen->vtbl.store_register_mem64 = iris_store_register_mem64;
10314    screen->vtbl.store_data_imm32 = iris_store_data_imm32;
10315    screen->vtbl.store_data_imm64 = iris_store_data_imm64;
10316    screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
10317    screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
10318    screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
10319    screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
10320    screen->vtbl.populate_vs_key = iris_populate_vs_key;
10321    screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
10322    screen->vtbl.populate_tes_key = iris_populate_tes_key;
10323    screen->vtbl.populate_gs_key = iris_populate_gs_key;
10324    screen->vtbl.populate_fs_key = iris_populate_fs_key;
10325    screen->vtbl.populate_cs_key = iris_populate_cs_key;
10326    screen->vtbl.lost_genx_state = iris_lost_genx_state;
10327    screen->vtbl.disable_rhwo_optimization = iris_disable_rhwo_optimization;
10328 }
10329 
10330 void
10331 genX(init_state)(struct iris_context *ice)
10332 {
10333    struct pipe_context *ctx = &ice->ctx;
10334    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
10335 
10336    ctx->create_blend_state = iris_create_blend_state;
10337    ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
10338    ctx->create_rasterizer_state = iris_create_rasterizer_state;
10339    ctx->create_sampler_state = iris_create_sampler_state;
10340    ctx->create_sampler_view = iris_create_sampler_view;
10341    ctx->create_surface = iris_create_surface;
10342    ctx->create_vertex_elements_state = iris_create_vertex_elements;
10343    ctx->bind_blend_state = iris_bind_blend_state;
10344    ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
10345    ctx->bind_sampler_states = iris_bind_sampler_states;
10346    ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
10347    ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
10348    ctx->delete_blend_state = iris_delete_state;
10349    ctx->delete_depth_stencil_alpha_state = iris_delete_state;
10350    ctx->delete_rasterizer_state = iris_delete_state;
10351    ctx->delete_sampler_state = iris_delete_state;
10352    ctx->delete_vertex_elements_state = iris_delete_state;
10353    ctx->set_blend_color = iris_set_blend_color;
10354    ctx->set_clip_state = iris_set_clip_state;
10355    ctx->set_constant_buffer = iris_set_constant_buffer;
10356    ctx->set_shader_buffers = iris_set_shader_buffers;
10357    ctx->set_shader_images = iris_set_shader_images;
10358    ctx->set_sampler_views = iris_set_sampler_views;
10359    ctx->set_compute_resources = iris_set_compute_resources;
10360    ctx->set_global_binding = iris_set_global_binding;
10361    ctx->set_tess_state = iris_set_tess_state;
10362    ctx->set_patch_vertices = iris_set_patch_vertices;
10363    ctx->set_framebuffer_state = iris_set_framebuffer_state;
10364    ctx->set_polygon_stipple = iris_set_polygon_stipple;
10365    ctx->set_sample_mask = iris_set_sample_mask;
10366    ctx->set_scissor_states = iris_set_scissor_states;
10367    ctx->set_stencil_ref = iris_set_stencil_ref;
10368    ctx->set_vertex_buffers = iris_set_vertex_buffers;
10369    ctx->set_viewport_states = iris_set_viewport_states;
10370    ctx->sampler_view_destroy = iris_sampler_view_destroy;
10371    ctx->surface_destroy = iris_surface_destroy;
10372    ctx->draw_vbo = iris_draw_vbo;
10373    ctx->launch_grid = iris_launch_grid;
10374    ctx->create_stream_output_target = iris_create_stream_output_target;
10375    ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
10376    ctx->set_stream_output_targets = iris_set_stream_output_targets;
10377    ctx->set_frontend_noop = iris_set_frontend_noop;
10378 
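   /* Flag all state dirty so everything is emitted the first time the
    * context is used.
    */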
10379    ice->state.dirty = ~0ull;
10380    ice->state.stage_dirty = ~0ull;
10381 
10382    ice->state.statistics_counters_enabled = true;
10383 
10384    ice->state.sample_mask = 0xffff;
10385    ice->state.num_viewports = 1;
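   /* MESA_PRIM_COUNT is an out-of-range sentinel, so the first draw is
    * always treated as a primitive mode change.
    */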
10386    ice->state.prim_mode = MESA_PRIM_COUNT;
10387    ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
10388    ice->draw.derived_params.drawid = -1;
10389 
10390 #if GFX_VERx10 >= 120
10391    ice->state.genx->object_preemption = true;
10392 #endif
10393 
10394    /* Make a 1x1x1 null surface for unbound textures */
10395    void *null_surf_map =
10396       upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
10397                    4 * GENX(RENDER_SURFACE_STATE_length), 64);
10398    isl_null_fill_state(&screen->isl_dev, null_surf_map,
10399                        .size = isl_extent3d(1, 1, 1));
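   /* Rebase the stored offset so it is relative to the base address
    * rather than the start of the BO.
    */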
10400    ice->state.unbound_tex.offset +=
10401       iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));
10402 
10403    /* Default all scissor rectangles to be empty regions. */
10404    for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
10405       ice->state.scissors[i] = (struct pipe_scissor_state) {
10406          .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
10407       };
10408    }
10409 }
10410