1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_state.c
25 *
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate back
43 * and forth between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below. First, the CSO hooks,
68 * which create/bind/track state. Second, the draw-time upload functions,
69 * iris_upload_render_state() and iris_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
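/*
 * As a concrete sketch of the "| together their DWords" idea above (the
 * packet name, CSO fields, and emit helper here are illustrative, not the
 * exact helpers used later in this file):
 *
 *    uint32_t merged[GENX(3DSTATE_FOO_length)];
 *    for (int i = 0; i < GENX(3DSTATE_FOO_length); i++)
 *       merged[i] = cso_a->packed_foo[i] | cso_b->packed_foo[i];
 *    iris_batch_emit(batch, merged, sizeof(merged));
 *
 * Each CSO packs its copy of the packet with the other CSO's fields left
 * as zero, so OR-ing the two copies yields the fully filled-out command.
 */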
72
73 #include <stdio.h>
74 #include <errno.h>
75
76 #ifdef HAVE_VALGRIND
77 #include <valgrind.h>
78 #include <memcheck.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83
84 #include "pipe/p_defines.h"
85 #include "pipe/p_state.h"
86 #include "pipe/p_context.h"
87 #include "pipe/p_screen.h"
88 #include "util/u_dual_blend.h"
89 #include "util/u_inlines.h"
90 #include "util/format/u_format.h"
91 #include "util/u_framebuffer.h"
92 #include "util/u_transfer.h"
93 #include "util/u_upload_mgr.h"
94 #include "util/u_viewport.h"
95 #include "util/u_memory.h"
96 #include "util/u_trace_gallium.h"
97 #include "nir.h"
98 #include "intel/common/intel_aux_map.h"
99 #include "intel/common/intel_compute_slm.h"
100 #include "intel/common/intel_l3_config.h"
101 #include "intel/common/intel_sample_positions.h"
102 #include "intel/ds/intel_tracepoints.h"
103 #include "iris_batch.h"
104 #include "iris_context.h"
105 #include "iris_defines.h"
106 #include "iris_pipe.h"
107 #include "iris_resource.h"
108 #include "iris_utrace.h"
109
110 #include "iris_genx_macros.h"
111
112 #if GFX_VER >= 9
113 #include "intel/compiler/brw_compiler.h"
114 #include "intel/common/intel_genX_state_brw.h"
115 #else
116 #include "intel/compiler/elk/elk_compiler.h"
117 #include "intel/common/intel_genX_state_elk.h"
118 #endif
119
120 #include "intel/common/intel_guardband.h"
121 #include "intel/common/intel_pixel_hash.h"
122 #include "intel/common/intel_tiled_render.h"
123
124 /**
125 * Statically assert that PIPE_* enums match the hardware packets.
126 * (As long as they match, we don't need to translate them.)
127 */
128 UNUSED static void pipe_asserts()
129 {
130 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
131
132 /* pipe_logicop happens to match the hardware. */
133 PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
134 PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
135 PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
136 PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
137 PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
138 PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
139 PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
140 PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
141 PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
142 PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
143 PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
144 PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
145 PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
146 PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
147 PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
148 PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
149
150 /* pipe_blend_func happens to match the hardware. */
151 PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
152 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
153 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
154 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
155 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
156 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
157 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
158 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
159 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
160 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
161 PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
162 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
163 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
164 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
165 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
166 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
167 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
168 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
169 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
170
171 /* pipe_blend_func happens to match the hardware. */
172 PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
173 PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
174 PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
175 PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
176 PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
177
178 /* pipe_stencil_op happens to match the hardware. */
179 PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
180 PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
181 PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
182 PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
183 PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
184 PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
185 PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
186 PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
187
188 /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
189 PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
190 PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
191 #undef PIPE_ASSERT
192 }
193
194 static unsigned
195 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
196 {
197 static const unsigned map[] = {
198 [MESA_PRIM_POINTS] = _3DPRIM_POINTLIST,
199 [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
200 [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
201 [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
202 [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
203 [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
204 [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
205 [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
206 [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
207 [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
208 [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
209 [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
210 [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
211 [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
212 [MESA_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
213 };
214
215 return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
216 }
217
218 static unsigned
219 translate_compare_func(enum pipe_compare_func pipe_func)
220 {
221 static const unsigned map[] = {
222 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
223 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
224 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
225 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
226 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
227 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
228 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
229 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
230 };
231 return map[pipe_func];
232 }
233
234 static unsigned
235 translate_shadow_func(enum pipe_compare_func pipe_func)
236 {
237 /* Gallium specifies the result of shadow comparisons as:
238 *
239 * 1 if ref <op> texel,
240 * 0 otherwise.
241 *
242 * The hardware does:
243 *
244 * 0 if texel <op> ref,
245 * 1 otherwise.
246 *
247 * So we need to flip the operator and also negate.
248 */
249 static const unsigned map[] = {
250 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
251 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
252 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
253 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
254 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
255 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
256 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
257 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
258 };
259 return map[pipe_func];
260 }
261
262 static unsigned
263 translate_cull_mode(unsigned pipe_face)
264 {
265 static const unsigned map[4] = {
266 [PIPE_FACE_NONE] = CULLMODE_NONE,
267 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
268 [PIPE_FACE_BACK] = CULLMODE_BACK,
269 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
270 };
271 return map[pipe_face];
272 }
273
274 static unsigned
275 translate_fill_mode(unsigned pipe_polymode)
276 {
277 static const unsigned map[4] = {
278 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
279 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
280 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
281 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282 };
283 return map[pipe_polymode];
284 }
285
286 static unsigned
287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289 static const unsigned map[] = {
290 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
292 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
293 };
294 return map[pipe_mip];
295 }
296
297 static uint32_t
298 translate_wrap(unsigned pipe_wrap)
299 {
300 static const unsigned map[] = {
301 [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
302 [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
303 [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
304 [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
305 [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
306 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
307
308 /* These are unsupported. */
309 [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
310 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
311 };
312 return map[pipe_wrap];
313 }
314
315 /**
316 * Allocate space for some indirect state.
317 *
318 * Return a pointer to the map (to fill it out) and a state ref (for
319 * referring to the state in GPU commands).
320 */
321 static void *
322 upload_state(struct u_upload_mgr *uploader,
323 struct iris_state_ref *ref,
324 unsigned size,
325 unsigned alignment)
326 {
327 void *p = NULL;
328 u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
329 return p;
330 }
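/*
 * A typical caller looks roughly like this (a sketch; the uploader, packed
 * structure, and sizes are illustrative):
 *
 *    struct iris_state_ref ref = {0};
 *    uint32_t *map = upload_state(ice->state.dynamic_uploader, &ref,
 *                                 4 * GENX(SAMPLER_STATE_length), 32);
 *    if (map)
 *       GENX(SAMPLER_STATE_pack)(NULL, map, &sampler);
 *
 * ref.res and ref.offset can then be used to point GPU commands at the
 * newly uploaded state.
 */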
331
332 /**
333 * Stream out temporary/short-lived state.
334 *
335 * This allocates space, pins the BO, and includes the BO address in the
336 * returned offset (which works because all state lives in 32-bit memory
337 * zones).
338 */
339 static uint32_t *
340 stream_state(struct iris_batch *batch,
341 struct u_upload_mgr *uploader,
342 struct pipe_resource **out_res,
343 unsigned size,
344 unsigned alignment,
345 uint32_t *out_offset)
346 {
347 void *ptr = NULL;
348
349 u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
350
351 struct iris_bo *bo = iris_resource_bo(*out_res);
352 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
353
354 iris_record_state_size(batch->state_sizes,
355 bo->address + *out_offset, size);
356
357 *out_offset += iris_bo_offset_from_base_address(bo);
358
359 return ptr;
360 }
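/*
 * Callers typically pack state into the returned map and then point a
 * 3DSTATE_*_POINTERS-style command at the returned offset, roughly like
 * this (a sketch; the packet, resource slot, and sizes are illustrative):
 *
 *    uint32_t cc_offset;
 *    uint32_t *map = stream_state(batch, ice->state.dynamic_uploader,
 *                                 &ice->state.last_res.color_calc,
 *                                 4 * GENX(COLOR_CALC_STATE_length), 64,
 *                                 &cc_offset);
 *    GENX(COLOR_CALC_STATE_pack)(NULL, map, &cc);
 *    iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
 *       ptr.ColorCalcStatePointer = cc_offset;
 *       ptr.ColorCalcStatePointerValid = true;
 *    }
 */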
361
362 /**
363 * stream_state() + memcpy.
364 */
365 static uint32_t
366 emit_state(struct iris_batch *batch,
367 struct u_upload_mgr *uploader,
368 struct pipe_resource **out_res,
369 const void *data,
370 unsigned size,
371 unsigned alignment)
372 {
373 unsigned offset = 0;
374 uint32_t *map =
375 stream_state(batch, uploader, out_res, size, alignment, &offset);
376
377 if (map)
378 memcpy(map, data, size);
379
380 return offset;
381 }
382
383 /**
384 * Did field 'x' change between 'old_cso' and 'new_cso'?
385 *
386 * (If so, we may want to set some dirty flags.)
387 */
388 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
389 #define cso_changed_memcmp(x) \
390 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
391 #define cso_changed_memcmp_elts(x, n) \
392 (!old_cso || memcmp(old_cso->x, new_cso->x, n * sizeof(old_cso->x[0])) != 0)
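/*
 * These are used from the pipe->bind_*_state() hooks, roughly like this
 * (a sketch; the CSO field and dirty bit are illustrative):
 *
 *    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
 *    struct iris_rasterizer_state *new_cso = state;
 *
 *    if (cso_changed(line_width))
 *       ice->state.dirty |= IRIS_DIRTY_RASTER;
 *
 *    ice->state.cso_rast = new_cso;
 */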
393
394 static void
395 flush_before_state_base_change(struct iris_batch *batch)
396 {
397 /* Wa_14014427904 - We need additional invalidate/flush when
398 * emitting NP state commands with ATS-M in compute mode.
399 */
400 bool atsm_compute = intel_device_info_is_atsm(batch->screen->devinfo) &&
401 batch->name == IRIS_BATCH_COMPUTE;
402 uint32_t np_state_wa_bits =
403 PIPE_CONTROL_CS_STALL |
404 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
405 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
406 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
407 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
408 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
409 PIPE_CONTROL_FLUSH_HDC;
410
411 /* Flush before emitting STATE_BASE_ADDRESS.
412 *
413 * This isn't documented anywhere in the PRM. However, it seems to be
414 * necessary prior to changing the surface state base address. We've
415 * seen issues in Vulkan where we get GPU hangs when using multi-level
416 * command buffers which clear depth, reset state base address, and then
417 * go render stuff.
418 *
419 * Normally, in GL, we would trust the kernel to do sufficient stalls
420 * and flushes prior to executing our batch. However, it doesn't seem
421 * as if the kernel's flushing is always sufficient and we don't want to
422 * rely on it.
423 *
424 * We make this an end-of-pipe sync instead of a normal flush because we
425 * do not know the current status of the GPU. On Haswell at least,
426 * having a fast-clear operation in flight at the same time as a normal
427 * rendering operation can cause hangs. Since the kernel's flushing is
428 * insufficient, we need to ensure that any rendering operations from
429 * other processes are definitely complete before we try to do our own
430 * rendering. It's a bit of a big hammer but it appears to work.
431 */
432 iris_emit_end_of_pipe_sync(batch,
433 "change STATE_BASE_ADDRESS (flushes)",
434 atsm_compute ? np_state_wa_bits : 0 |
435 PIPE_CONTROL_RENDER_TARGET_FLUSH |
436 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
437 PIPE_CONTROL_DATA_CACHE_FLUSH);
438 }
439
440 static void
441 flush_after_state_base_change(struct iris_batch *batch)
442 {
443 const struct intel_device_info *devinfo = batch->screen->devinfo;
444 /* After re-setting the surface state base address, we have to do some
445 * cache flushing so that the sampler engine will pick up the new
446 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
447 * Shared Function > 3D Sampler > State > State Caching (page 96):
448 *
449 * Coherency with system memory in the state cache, like the texture
450 * cache is handled partially by software. It is expected that the
451 * command stream or shader will issue Cache Flush operation or
452 * Cache_Flush sampler message to ensure that the L1 cache remains
453 * coherent with system memory.
454 *
455 * [...]
456 *
457 * Whenever the value of the Dynamic_State_Base_Addr,
458 * Surface_State_Base_Addr are altered, the L1 state cache must be
459 * invalidated to ensure the new surface or sampler state is fetched
460 * from system memory.
461 *
462 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
463 * which, according to the PIPE_CONTROL instruction documentation in the
464 * Broadwell PRM:
465 *
466 * Setting this bit is independent of any other bit in this packet.
467 * This bit controls the invalidation of the L1 and L2 state caches
468 * at the top of the pipe i.e. at the parsing time.
469 *
470 * Unfortunately, experimentation seems to indicate that state cache
471 * invalidation through a PIPE_CONTROL does nothing whatsoever in
472 * regards to surface state and binding tables. Instead, it seems that
473 * invalidating the texture cache is what is actually needed.
474 *
475 * XXX: As far as we have been able to determine through
476 * experimentation, flushing the texture cache appears to be
477 * sufficient. The theory here is that all of the sampling/rendering
478 * units cache the binding table in the texture cache. However, we have
479 * yet to be able to actually confirm this.
480 *
481 * Wa_16013000631:
482 *
483 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
484 * or program pipe control with Instruction cache invalidate post
485 * STATE_BASE_ADDRESS command"
486 */
487 iris_emit_end_of_pipe_sync(batch,
488 "change STATE_BASE_ADDRESS (invalidates)",
489 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
490 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
491 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
492 (intel_needs_workaround(devinfo, 16013000631) ?
493 PIPE_CONTROL_INSTRUCTION_INVALIDATE : 0));
494 }
495
496 static void
497 iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,
498 uint32_t src)
499 {
500 struct mi_builder b;
501 mi_builder_init(&b, batch->screen->devinfo, batch);
502 mi_store(&b, mi_reg32(dst), mi_reg32(src));
503 }
504
505 static void
506 iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,
507 uint32_t src)
508 {
509 struct mi_builder b;
510 mi_builder_init(&b, batch->screen->devinfo, batch);
511 mi_store(&b, mi_reg64(dst), mi_reg64(src));
512 }
513
514 static void
515 iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,
516 uint32_t val)
517 {
518 struct mi_builder b;
519 mi_builder_init(&b, batch->screen->devinfo, batch);
520 mi_store(&b, mi_reg32(reg), mi_imm(val));
521 }
522
523 static void
524 iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
525 uint64_t val)
526 {
527 struct mi_builder b;
528 mi_builder_init(&b, batch->screen->devinfo, batch);
529 mi_store(&b, mi_reg64(reg), mi_imm(val));
530 }
531
532 /**
533 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
534 */
535 static void
536 iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
537 struct iris_bo *bo, uint32_t offset)
538 {
539 iris_batch_sync_region_start(batch);
540 struct mi_builder b;
541 mi_builder_init(&b, batch->screen->devinfo, batch);
542 struct mi_value src = mi_mem32(ro_bo(bo, offset));
543 mi_store(&b, mi_reg32(reg), src);
544 iris_batch_sync_region_end(batch);
545 }
546
547 /**
548 * Load a 64-bit value from a buffer into a MMIO register via
549 * two MI_LOAD_REGISTER_MEM commands.
550 */
551 static void
552 iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
553 struct iris_bo *bo, uint32_t offset)
554 {
555 iris_batch_sync_region_start(batch);
556 struct mi_builder b;
557 mi_builder_init(&b, batch->screen->devinfo, batch);
558 struct mi_value src = mi_mem64(ro_bo(bo, offset));
559 mi_store(&b, mi_reg64(reg), src);
560 iris_batch_sync_region_end(batch);
561 }
562
563 static void
564 iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
565 struct iris_bo *bo, uint32_t offset,
566 bool predicated)
567 {
568 iris_batch_sync_region_start(batch);
569 struct mi_builder b;
570 mi_builder_init(&b, batch->screen->devinfo, batch);
571 struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
572 struct mi_value src = mi_reg32(reg);
573 if (predicated)
574 mi_store_if(&b, dst, src);
575 else
576 mi_store(&b, dst, src);
577 iris_batch_sync_region_end(batch);
578 }
579
580 static void
581 iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
582 struct iris_bo *bo, uint32_t offset,
583 bool predicated)
584 {
585 iris_batch_sync_region_start(batch);
586 struct mi_builder b;
587 mi_builder_init(&b, batch->screen->devinfo, batch);
588 struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
589 struct mi_value src = mi_reg64(reg);
590 if (predicated)
591 mi_store_if(&b, dst, src);
592 else
593 mi_store(&b, dst, src);
594 iris_batch_sync_region_end(batch);
595 }
596
597 static void
598 iris_store_data_imm32(struct iris_batch *batch,
599 struct iris_bo *bo, uint32_t offset,
600 uint32_t imm)
601 {
602 iris_batch_sync_region_start(batch);
603 struct mi_builder b;
604 mi_builder_init(&b, batch->screen->devinfo, batch);
605 struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
606 struct mi_value src = mi_imm(imm);
607 mi_store(&b, dst, src);
608 iris_batch_sync_region_end(batch);
609 }
610
611 static void
612 iris_store_data_imm64(struct iris_batch *batch,
613 struct iris_bo *bo, uint32_t offset,
614 uint64_t imm)
615 {
616 iris_batch_sync_region_start(batch);
617 struct mi_builder b;
618 mi_builder_init(&b, batch->screen->devinfo, batch);
619 struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
620 struct mi_value src = mi_imm(imm);
621 mi_store(&b, dst, src);
622 iris_batch_sync_region_end(batch);
623 }
624
625 static void
626 iris_copy_mem_mem(struct iris_batch *batch,
627 struct iris_bo *dst_bo, uint32_t dst_offset,
628 struct iris_bo *src_bo, uint32_t src_offset,
629 unsigned bytes)
630 {
631 /* MI_COPY_MEM_MEM operates on DWords. */
632 assert(bytes % 4 == 0);
633 assert(dst_offset % 4 == 0);
634 assert(src_offset % 4 == 0);
635 iris_batch_sync_region_start(batch);
636
637 for (unsigned i = 0; i < bytes; i += 4) {
638 iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
639 cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
640 IRIS_DOMAIN_OTHER_WRITE);
641 cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
642 }
643 }
644
645 iris_batch_sync_region_end(batch);
646 }
647
648 static void
649 iris_rewrite_compute_walker_pc(struct iris_batch *batch,
650 uint32_t *walker,
651 struct iris_bo *bo,
652 uint32_t offset)
653 {
654 #if GFX_VERx10 >= 125
655 struct iris_screen *screen = batch->screen;
656 struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
657
658 uint32_t dwords[GENX(COMPUTE_WALKER_length)];
659
660 _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
661 cw.PostSync.Operation = WriteTimestamp;
662 cw.PostSync.DestinationAddress = addr;
663 cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
664 }
665
666 for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
667 walker[i] |= dwords[i];
668 #else
669 unreachable("Unsupported");
670 #endif
671 }
672
673 static void
674 emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
675 {
676 /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
677 #if GFX_VER < 20
678
679 #if GFX_VER >= 8 && GFX_VER < 10
680 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
681 *
682 * Software must clear the COLOR_CALC_STATE Valid field in
683 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
684 * with Pipeline Select set to GPGPU.
685 *
686 * The internal hardware docs recommend the same workaround for Gfx9
687 * hardware too.
688 */
689 if (pipeline == GPGPU)
690 iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
691 #endif
692
693 #if GFX_VER >= 12
694 /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
695 *
696 * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
697 * are flushed through a stalling PIPE_CONTROL command prior to
698 * programming of PIPELINE_SELECT command transitioning Pipeline Select
699 * from 3D to GPGPU/Media.
700 * Software must ensure HDC Pipeline flush and Generic Media State Clear
701 * is issued through a stalling PIPE_CONTROL command prior to programming
702 * of PIPELINE_SELECT command transitioning Pipeline Select from
703 * GPGPU/Media to 3D."
704 *
705 * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
706 * because PIPE was not in MEDIA mode?!
707 */
708 enum pipe_control_flags flags = PIPE_CONTROL_CS_STALL |
709 PIPE_CONTROL_FLUSH_HDC;
710
711 if (pipeline == GPGPU && batch->name == IRIS_BATCH_RENDER) {
712 flags |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
713 PIPE_CONTROL_DEPTH_CACHE_FLUSH;
714 } else {
715 flags |= PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH;
716 }
717 /* Wa_16013063087 - State Cache Invalidate must be issued prior to
718 * PIPELINE_SELECT when switching from 3D to Compute.
719 *
720 * SW must do this by programming of PIPECONTROL with “CS Stall” followed
721 * by a PIPECONTROL with State Cache Invalidate bit set.
722 */
723 if (pipeline == GPGPU &&
724 intel_needs_workaround(batch->screen->devinfo, 16013063087))
725 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
726
727 iris_emit_pipe_control_flush(batch, "PIPELINE_SELECT flush", flags);
728 #else
729 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
730 * PIPELINE_SELECT [DevBWR+]":
731 *
732 * "Project: DEVSNB+
733 *
734 * Software must ensure all the write caches are flushed through a
735 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
736 * command to invalidate read only caches prior to programming
737 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
738 */
739 iris_emit_pipe_control_flush(batch,
740 "workaround: PIPELINE_SELECT flushes (1/2)",
741 PIPE_CONTROL_RENDER_TARGET_FLUSH |
742 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
743 PIPE_CONTROL_DATA_CACHE_FLUSH |
744 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
745 PIPE_CONTROL_CS_STALL);
746
747 iris_emit_pipe_control_flush(batch,
748 "workaround: PIPELINE_SELECT flushes (2/2)",
749 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
750 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
751 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
752 PIPE_CONTROL_INSTRUCTION_INVALIDATE);
753 #endif
754
755 iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
756 #if GFX_VER >= 9
757 sel.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
758 #if GFX_VER == 12
759 sel.MediaSamplerDOPClockGateEnable = true;
760 #endif /* if GFX_VER == 12 */
761 #endif /* if GFX_VER >= 9 */
762 sel.PipelineSelection = pipeline;
763 }
764 #endif /* if GFX_VER < 20 */
765 }
766
767 UNUSED static void
768 init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
769 {
770 #if GFX_VER == 9
771 /* Project: DevGLK
772 *
773 * "This chicken bit works around a hardware issue with barrier
774 * logic encountered when switching between GPGPU and 3D pipelines.
775 * To workaround the issue, this mode bit should be set after a
776 * pipeline is selected."
777 */
778 iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
779 reg.GLKBarrierMode = value;
780 reg.GLKBarrierModeMask = 1;
781 }
782 #endif
783 }
784
785 static void
786 init_state_base_address(struct iris_batch *batch)
787 {
788 struct isl_device *isl_dev = &batch->screen->isl_dev;
789 uint32_t mocs = isl_mocs(isl_dev, 0, false);
790 flush_before_state_base_change(batch);
791
792 /* We program most base addresses once at context initialization time.
793 * Each base address points at a 4GB memory zone, and never needs to
794 * change. See iris_bufmgr.h for a description of the memory zones.
795 *
796 * The one exception is Surface State Base Address, which needs to be
797 * updated occasionally. See iris_binder.c for the details there.
798 */
799 iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
800 sba.GeneralStateMOCS = mocs;
801 sba.StatelessDataPortAccessMOCS = mocs;
802 sba.DynamicStateMOCS = mocs;
803 sba.IndirectObjectMOCS = mocs;
804 sba.InstructionMOCS = mocs;
805 sba.SurfaceStateMOCS = mocs;
806 #if GFX_VER >= 9
807 sba.BindlessSurfaceStateMOCS = mocs;
808 #endif
809
810 sba.GeneralStateBaseAddressModifyEnable = true;
811 sba.DynamicStateBaseAddressModifyEnable = true;
812 sba.IndirectObjectBaseAddressModifyEnable = true;
813 sba.InstructionBaseAddressModifyEnable = true;
814 sba.GeneralStateBufferSizeModifyEnable = true;
815 sba.DynamicStateBufferSizeModifyEnable = true;
816 sba.SurfaceStateBaseAddressModifyEnable = true;
817 #if GFX_VER >= 11
818 sba.BindlessSamplerStateMOCS = mocs;
819 #endif
820 sba.IndirectObjectBufferSizeModifyEnable = true;
821 sba.InstructionBuffersizeModifyEnable = true;
822
823 sba.InstructionBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
824 sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
825 sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDER_START);
826
827 sba.GeneralStateBufferSize = 0xfffff;
828 sba.IndirectObjectBufferSize = 0xfffff;
829 sba.InstructionBufferSize = 0xfffff;
830 sba.DynamicStateBufferSize = 0xfffff;
831 #if GFX_VERx10 >= 125
832 sba.L1CacheControl = L1CC_WB;
833 #endif
834 }
835
836 flush_after_state_base_change(batch);
837 }
838
839 static void
840 iris_emit_l3_config(struct iris_batch *batch,
841 const struct intel_l3_config *cfg)
842 {
843 #if GFX_VER < 20
844 assert(cfg || GFX_VER >= 12);
845
846 #if GFX_VER >= 12
847 #define L3_ALLOCATION_REG GENX(L3ALLOC)
848 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
849 #else
850 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
851 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
852 #endif
853
854 iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
855 #if GFX_VER < 11
856 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
857 #endif
858 #if GFX_VER == 11
859 /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
860 * in L3CNTLREG register. The default setting of the bit is not the
861 * desirable behavior.
862 */
863 reg.ErrorDetectionBehaviorControl = true;
864 reg.UseFullWays = true;
865 #endif
866 if (GFX_VER < 12 || (cfg && cfg->n[INTEL_L3P_ALL] <= 126)) {
867 reg.URBAllocation = cfg->n[INTEL_L3P_URB];
868 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
869 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
870 reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
871 } else {
872 assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
873 cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
874 cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
875 cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
876 #if GFX_VER >= 12
877 reg.L3FullWayAllocationEnable = true;
878 #endif
879 }
880 }
881 #endif /* GFX_VER < 20 */
882 }
883
884 void
885 genX(emit_urb_config)(struct iris_batch *batch,
886 bool has_tess_eval,
887 bool has_geometry)
888 {
889 struct iris_screen *screen = batch->screen;
890 struct iris_context *ice = batch->ice;
891
892 intel_get_urb_config(screen->devinfo,
893 screen->l3_config_3d,
894 has_tess_eval,
895 has_geometry,
896 &ice->shaders.urb.cfg,
897 &ice->state.urb_deref_block_size,
898 &ice->shaders.urb.constrained);
899
900 genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
901
902 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
903 iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
904 urb._3DCommandSubOpcode += i;
905 urb.VSURBStartingAddress = ice->shaders.urb.cfg.start[i];
906 urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
907 urb.VSNumberofURBEntries = ice->shaders.urb.cfg.entries[i];
908 }
909 }
910 }
911
912 #if GFX_VER == 9
913 static void
914 iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
915 {
916 /* A fixed function pipe flush is required before modifying this field */
917 iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"
918 : "disable preemption",
919 PIPE_CONTROL_RENDER_TARGET_FLUSH);
920
921 /* enable object level preemption */
922 iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
923 reg.ReplayMode = enable;
924 reg.ReplayModeMask = true;
925 }
926 }
927 #endif
928
929 static void
930 upload_pixel_hashing_tables(struct iris_batch *batch)
931 {
932 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
933 UNUSED struct iris_context *ice = batch->ice;
934 assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
935
936 #if GFX_VER == 11
937 /* Gfx11 hardware has two pixel pipes at most. */
938 for (unsigned i = 2; i < ARRAY_SIZE(devinfo->ppipe_subslices); i++)
939 assert(devinfo->ppipe_subslices[i] == 0);
940
941 if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
942 return;
943
944 unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
945 uint32_t hash_address;
946 struct pipe_resource *tmp = NULL;
947 uint32_t *map =
948 stream_state(batch, ice->state.dynamic_uploader, &tmp,
949 size, 64, &hash_address);
950 pipe_resource_reference(&tmp, NULL);
951
952 const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
953 struct GENX(SLICE_HASH_TABLE) table;
954 intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
955
956 GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
957
958 iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
959 ptr.SliceHashStatePointerValid = true;
960 ptr.SliceHashTableStatePointer = hash_address;
961 }
962
963 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
964 mode.SliceHashingTableEnable = true;
965 }
966
967 #elif GFX_VERx10 == 120
968 /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
969 * present with n active dual subslices.
970 */
971 unsigned ppipes_of[3] = {};
972
973 for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
974 for (unsigned p = 0; p < 3; p++)
975 ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
976 }
977
978 /* Gfx12 has three pixel pipes. */
979 for (unsigned p = 3; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
980 assert(devinfo->ppipe_subslices[p] == 0);
981
982 if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
983 /* All three pixel pipes have the maximum number of active dual
984 * subslices, or there is only one active pixel pipe: Nothing to do.
985 */
986 return;
987 }
988
989 iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
990 p.SliceHashControl[0] = TABLE_0;
991
992 if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
993 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
994 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
995 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
996
997 if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
998 intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
999 else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1000 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
1001 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1002 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
1003 else
1004 unreachable("Illegal fusing.");
1005 }
1006
1007 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1008 p.SubsliceHashingTableEnable = true;
1009 p.SubsliceHashingTableEnableMask = true;
1010 }
1011
1012 #elif GFX_VERx10 == 125
1013 struct pipe_screen *pscreen = &batch->screen->base;
1014 const unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
1015 const struct pipe_resource tmpl = {
1016 .target = PIPE_BUFFER,
1017 .format = PIPE_FORMAT_R8_UNORM,
1018 .bind = PIPE_BIND_CUSTOM,
1019 .usage = PIPE_USAGE_IMMUTABLE,
1020 .flags = IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE,
1021 .width0 = size,
1022 .height0 = 1,
1023 .depth0 = 1,
1024 .array_size = 1
1025 };
1026
1027 pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
1028 ice->state.pixel_hashing_tables = pscreen->resource_create(pscreen, &tmpl);
1029
1030 struct iris_resource *res = (struct iris_resource *)ice->state.pixel_hashing_tables;
1031 struct pipe_transfer *transfer = NULL;
1032 uint32_t *map = pipe_buffer_map_range(&ice->ctx, ice->state.pixel_hashing_tables,
1033 0, size, PIPE_MAP_WRITE,
1034 &transfer);
1035
1036 /* Calculate the set of present pixel pipes, and another set of
1037 * present pixel pipes with 2 dual subslices enabled; the latter
1038 * will appear on the hashing table with twice the frequency of
1039 * pixel pipes with a single dual subslice present.
1040 */
1041 uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
1042 for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
1043 if (devinfo->ppipe_subslices[p])
1044 ppipe_mask1 |= (1u << p);
1045 if (devinfo->ppipe_subslices[p] > 1)
1046 ppipe_mask2 |= (1u << p);
1047 }
1048 assert(ppipe_mask1);
1049
1050 struct GENX(SLICE_HASH_TABLE) table;
1051
1052 /* Note that the hardware expects an array with 7 tables: each
1053 * table is intended to specify the pixel pipe hashing behavior for
1054 * every possible slice count between 2 and 8. However, that doesn't
1055 * actually work, among other reasons due to hardware bugs that
1056 * will cause the GPU to erroneously access the table at the wrong
1057 * index in some cases, so in practice all 7 tables need to be
1058 * initialized to the same value.
1059 */
1060 for (unsigned i = 0; i < 7; i++)
1061 intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
1062 table.Entry[i][0]);
1063
1064 GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
1065
1066 pipe_buffer_unmap(&ice->ctx, transfer);
1067
1068 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_NONE);
1069 iris_record_state_size(batch->state_sizes, res->bo->address + res->offset, size);
1070
1071 iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
1072 ptr.SliceHashStatePointerValid = true;
1073 ptr.SliceHashTableStatePointer = iris_bo_offset_from_base_address(res->bo) +
1074 res->offset;
1075 }
1076
1077 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
1078 mode.SliceHashingTableEnable = true;
1079 mode.SliceHashingTableEnableMask = true;
1080 mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
1081 hashing32x32 : NormalMode);
1082 mode.CrossSliceHashingModeMask = -1;
1083 }
1084 #endif
1085 }
1086
1087 static void
1088 iris_alloc_push_constants(struct iris_batch *batch)
1089 {
1090 const struct intel_device_info *devinfo = batch->screen->devinfo;
1091
1092 /* For now, we set a static partitioning of the push constant area,
1093 * assuming that all stages could be in use.
1094 *
1095 * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1096 * see if that improves performance by offering more space to
1097 * the VS/FS when those aren't in use. Also, try dynamically
1098 * enabling/disabling it like i965 does. This would be more
1099 * stalls and may not actually help; we don't know yet.
1100 */
1101
1102 /* Divide as equally as possible with any remainder given to FRAGMENT. */
1103 const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
1104 const unsigned stage_size = push_constant_kb / 5;
1105 const unsigned frag_size = push_constant_kb - 4 * stage_size;
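/* For example, with a hypothetical 32KB push constant area, each of
 * VS/HS/DS/GS would get 6KB (32 / 5, rounded down) and the fragment
 * shader would get the remaining 8KB.
 */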
1106
1107 for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1108 iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1109 alloc._3DCommandSubOpcode = 18 + i;
1110 alloc.ConstantBufferOffset = stage_size * i;
1111 alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
1112 }
1113 }
1114
1115 #if GFX_VERx10 == 125
1116 /* DG2: Wa_22011440098
1117 * MTL: Wa_18022330953
1118 *
1119 * In 3D mode, after programming push constant alloc command immediately
1120 * program push constant command(ZERO length) without any commit between
1121 * them.
1122 */
1123 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
1124 /* Update empty push constants for all stages (bitmask = 11111b) */
1125 c.ShaderUpdateEnable = 0x1f;
1126 c.MOCS = iris_mocs(NULL, &batch->screen->isl_dev, 0);
1127 }
1128 #endif
1129 }
1130
1131 #if GFX_VER >= 12
1132 static void
1133 init_aux_map_state(struct iris_batch *batch);
1134 #endif
1135
1136 /* This updates a register. Caller should stall the pipeline as needed. */
1137 static void
1138 iris_disable_rhwo_optimization(struct iris_batch *batch, bool disable)
1139 {
1140 assert(batch->screen->devinfo->verx10 == 120);
1141 #if GFX_VERx10 == 120
1142 iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1143 c1.RCCRHWOOptimizationDisable = disable;
1144 c1.RCCRHWOOptimizationDisableMask = true;
1145 };
1146 #endif
1147 }
1148
1149 /**
1150 * Upload initial GPU state for any kind of context.
1151 *
1152 * These need to happen for both render and compute.
1153 */
1154 static void
1155 iris_init_common_context(struct iris_batch *batch)
1156 {
1157 #if GFX_VER == 11
1158 iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
1159 reg.HeaderlessMessageforPreemptableContexts = 1;
1160 reg.HeaderlessMessageforPreemptableContextsMask = 1;
1161 }
1162
1163 /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
1164 iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
1165 reg.EnabledTexelOffsetPrecisionFix = 1;
1166 reg.EnabledTexelOffsetPrecisionFixMask = 1;
1167 }
1168 #endif
1169
1170 /* Select 256B-aligned binding table mode on Icelake through Tigerlake,
1171 * which gives us larger binding table pointers, at the cost of higher
1172 * alignment requirements (bits 18:8 are valid instead of 15:5). When
1173 * using this mode, we have to shift binding table pointers by 3 bits,
1174 * as they're still stored in the same bit-location in the field.
1175 */
1176 #if GFX_VER >= 11 && GFX_VERx10 < 125
1177 iris_emit_reg(batch, GENX(GT_MODE), reg) {
1178 reg.BindingTableAlignment = BTP_18_8;
1179 reg.BindingTableAlignmentMask = true;
1180 }
1181 #endif
1182
1183 #if GFX_VERx10 == 125
1184 /* Even though L3 partial write merging is supposed to be enabled
1185 * by default on Gfx12.5 according to the hardware spec, i915
1186 * appears to accidentally clear the enables during context
1187 * initialization, so make sure to enable them here since partial
1188 * write merging has a large impact on rendering performance.
1189 */
1190 iris_emit_reg(batch, GENX(L3SQCREG5), reg) {
1191 reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
1192 reg.CompressiblePartialWriteMergeEnable = true;
1193 reg.CoherentPartialWriteMergeEnable = true;
1194 reg.CrossTilePartialWriteMergeEnable = true;
1195 }
1196 #endif
1197 }
1198
1199 static void
1200 toggle_protected(struct iris_batch *batch)
1201 {
1202 struct iris_context *ice;
1203
1204 if (batch->name == IRIS_BATCH_RENDER)
1205 ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_RENDER]);
1206 else if (batch->name == IRIS_BATCH_COMPUTE)
1207 ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_COMPUTE]);
1208 else
1209 unreachable("unhandled batch");
1210
1211 if (!ice->protected)
1212 return;
1213
1214 #if GFX_VER >= 12
1215 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1216 pc.CommandStreamerStallEnable = true;
1217 pc.RenderTargetCacheFlushEnable = true;
1218 pc.ProtectedMemoryDisable = true;
1219 }
1220 iris_emit_cmd(batch, GENX(MI_SET_APPID), appid) {
1221 /* Default value for single session. */
1222 appid.ProtectedMemoryApplicationID = 0xf;
1223 appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
1224 }
1225 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1226 pc.CommandStreamerStallEnable = true;
1227 pc.RenderTargetCacheFlushEnable = true;
1228 pc.ProtectedMemoryEnable = true;
1229 }
1230 #else
1231 unreachable("Not supported");
1232 #endif
1233 }
1234
1235 #if GFX_VER >= 20
1236 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
1237 #else
1238 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
1239 #endif
1240
1241 /**
1242 * Upload the initial GPU state for a render context.
1243 *
1244 * This sets some invariant state that needs to be programmed a particular
1245 * way, but that we never actually change.
1246 */
1247 static void
1248 iris_init_render_context(struct iris_batch *batch)
1249 {
1250 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1251
1252 iris_batch_sync_region_start(batch);
1253
1254 emit_pipeline_select(batch, _3D);
1255
1256 toggle_protected(batch);
1257
1258 iris_emit_l3_config(batch, batch->screen->l3_config_3d);
1259
1260 init_state_base_address(batch);
1261
1262 iris_init_common_context(batch);
1263
1264 #if GFX_VER >= 9
1265 iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1266 reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1267 reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1268 }
1269 #else
1270 iris_emit_reg(batch, GENX(INSTPM), reg) {
1271 reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1272 reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1273 }
1274 #endif
1275
1276 #if GFX_VER == 9
1277 iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1278 reg.FloatBlendOptimizationEnable = true;
1279 reg.FloatBlendOptimizationEnableMask = true;
1280 reg.MSCRAWHazardAvoidanceBit = true;
1281 reg.MSCRAWHazardAvoidanceBitMask = true;
1282 reg.PartialResolveDisableInVC = true;
1283 reg.PartialResolveDisableInVCMask = true;
1284 }
1285
1286 if (devinfo->platform == INTEL_PLATFORM_GLK)
1287 init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1288 #endif
1289
1290 #if GFX_VER == 11
1291 iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1292 reg.L3DataPartialWriteMergingEnable = true;
1293 reg.ColorZPartialWriteMergingEnable = true;
1294 reg.URBPartialWriteMergingEnable = true;
1295 reg.TCDisable = true;
1296 }
1297
1298 /* The hardware specification recommends disabling repacking for
1299 * compatibility with the decompression mechanism in the display controller.
1300 */
1301 if (devinfo->disable_ccs_repack) {
1302 iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1303 reg.DisableRepackingforCompression = true;
1304 reg.DisableRepackingforCompressionMask = true;
1305 }
1306 }
1307 #endif
1308
1309 #if GFX_VER == 12
1310 iris_emit_reg(batch, GENX(FF_MODE2), reg) {
1311 /* On Alchemist, the FF_MODE2 docs for the GS timer say:
1312 *
1313 * "The timer value must be set to 224."
1314 *
1315 * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
1316 * and that this is necessary to avoid hanging the HS/DS units. It
1317 * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
1318 *
1319 * The HS timer docs also have the same quote for Alchemist. I am
1320 * unaware of a reason it needs to be set to 224 on Tigerlake, but
1321 * we do so for consistency if nothing else.
1322 *
1323 * For the TDS timer value, the docs say:
1324 *
1325 * "For best performance, a value of 4 should be programmed."
1326 *
1327 * i915 also sets it this way on Tigerlake due to workarounds.
1328 *
1329 * The default VS timer appears to be 0, so we leave it at that.
1330 */
1331 reg.GSTimerValue = 224;
1332 reg.HSTimerValue = 224;
1333 reg.TDSTimerValue = 4;
1334 reg.VSTimerValue = 0;
1335 }
1336 #endif
1337
1338 #if INTEL_NEEDS_WA_1508744258
1339 /* The suggested workaround is:
1340 *
1341 * Disable RHWO by setting 0x7010[14] by default except during resolve
1342 * pass.
1343 *
1344 * We implement global disabling of the optimization here and we toggle it
1345 * in iris_resolve_color.
1346 *
1347 * iris_init_compute_context is unmodified because we don't expect to
1348 * access the RCC in the compute context. iris_mcs_partial_resolve is
1349 * unmodified because that pass doesn't use a HW bit to perform the
1350 * resolve (related HSDs specifically call out the RenderTargetResolveType
1351 * field in the 3DSTATE_PS instruction).
1352 */
1353 iris_disable_rhwo_optimization(batch, true);
1354 #endif
1355
1356 #if GFX_VERx10 == 120
1357 /* Wa_1806527549 says to disable the following HiZ optimization when the
1358 * depth buffer is D16_UNORM. We've found the WA to help with more depth
1359 * buffer configurations, however, so we always disable it just to be safe.
1360 */
1361 iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
1362 reg.HZDepthTestLEGEOptimizationDisable = true;
1363 reg.HZDepthTestLEGEOptimizationDisableMask = true;
1364 }
1365 #endif
1366
1367 #if GFX_VERx10 == 125
1368 iris_emit_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
1369 reg.TBIMRBatchSizeOverride = true;
1370 reg.TBIMROpenBatchEnable = true;
1371 reg.TBIMRFastClip = true;
1372 reg.TBIMRBatchSizeOverrideMask = true;
1373 reg.TBIMROpenBatchEnableMask = true;
1374 reg.TBIMRFastClipMask = true;
1375 };
1376 #endif
1377
1378 #if GFX_VER >= 20
1379 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1380 p.DX10OGLBorderModeforYCRCB = true;
1381 p.DX10OGLBorderModeforYCRCBMask = true;
1382 }
1383 #endif
1384
1385 upload_pixel_hashing_tables(batch);
1386
1387 /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1388 * changing it dynamically. We set it to the maximum size here, and
1389 * instead include the render target dimensions in the viewport, so
1390 * viewport extents clipping takes care of pruning stray geometry.
1391 */
1392 iris_emit_cmd(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
1393 rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1394 rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1395 }
1396
1397 /* Set the initial MSAA sample positions. */
1398 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1399 INTEL_SAMPLE_POS_1X(pat._1xSample);
1400 INTEL_SAMPLE_POS_2X(pat._2xSample);
1401 INTEL_SAMPLE_POS_4X(pat._4xSample);
1402 INTEL_SAMPLE_POS_8X(pat._8xSample);
1403 #if GFX_VER >= 9
1404 INTEL_SAMPLE_POS_16X(pat._16xSample);
1405 #endif
1406 }
1407
1408 /* Use the legacy AA line coverage computation. */
1409 iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1410
1411 /* Disable chromakeying (it's for media) */
1412 iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1413
1414 /* We want regular rendering, not special HiZ operations. */
1415 iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1416
1417 /* No polygon stippling offsets are necessary. */
1418 /* TODO: may need to set an offset for origin-UL framebuffers */
1419 iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1420
1421 #if GFX_VERx10 >= 125
1422 iris_emit_cmd(batch, GENX(3DSTATE_MESH_CONTROL), foo);
1423 iris_emit_cmd(batch, GENX(3DSTATE_TASK_CONTROL), foo);
1424 #endif
1425
1426 #if INTEL_NEEDS_WA_14019857787
1427 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1428 p.EnableOOOreadsinRCPB = true;
1429 p.EnableOOOreadsinRCPBMask = true;
1430 }
1431 #endif
1432
1433 iris_alloc_push_constants(batch);
1434
1435 #if GFX_VER >= 12
1436 init_aux_map_state(batch);
1437 #endif
1438
1439 iris_batch_sync_region_end(batch);
1440 }
1441
1442 static void
1443 iris_init_compute_context(struct iris_batch *batch)
1444 {
1445 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1446
1447 iris_batch_sync_region_start(batch);
1448
1449 /* Wa_1607854226:
1450 *
1451 * Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1452 */
1453 #if GFX_VERx10 == 120
1454 emit_pipeline_select(batch, _3D);
1455 #else
1456 emit_pipeline_select(batch, GPGPU);
1457 #endif
1458
1459 toggle_protected(batch);
1460
1461 iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1462
1463 init_state_base_address(batch);
1464
1465 iris_init_common_context(batch);
1466
1467 #if GFX_VERx10 == 120
1468 emit_pipeline_select(batch, GPGPU);
1469 #endif
1470
1471 #if GFX_VER == 9
1472 if (devinfo->platform == INTEL_PLATFORM_GLK)
1473 init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1474 #endif
1475
1476 #if GFX_VER >= 12
1477 init_aux_map_state(batch);
1478 #endif
1479
1480 #if GFX_VERx10 >= 125
1481 iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
1482 cfe.MaximumNumberofThreads =
1483 devinfo->max_cs_threads * devinfo->subslice_total;
1484 }
1485 #endif
1486
1487 iris_batch_sync_region_end(batch);
1488 }
1489
1490 static void
1491 iris_init_copy_context(struct iris_batch *batch)
1492 {
1493 iris_batch_sync_region_start(batch);
1494
1495 #if GFX_VER >= 12
1496 init_aux_map_state(batch);
1497 #endif
1498
1499 iris_batch_sync_region_end(batch);
1500 }
1501
1502 struct iris_vertex_buffer_state {
1503 /** The VERTEX_BUFFER_STATE hardware structure. */
1504 uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1505
1506 /** The resource to source vertex data from. */
1507 struct pipe_resource *resource;
1508
1509 int offset;
1510 };
1511
1512 struct iris_depth_buffer_state {
1513 /* Depth/HiZ/Stencil related hardware packets. */
1514 #if GFX_VER < 20
1515 uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1516 GENX(3DSTATE_STENCIL_BUFFER_length) +
1517 GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
1518 GENX(3DSTATE_CLEAR_PARAMS_length)];
1519 #else
1520 uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1521 GENX(3DSTATE_STENCIL_BUFFER_length) +
1522 GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
1523 #endif
1524 };
1525
1526 #if INTEL_NEEDS_WA_1808121037
1527 enum iris_depth_reg_mode {
1528 IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
1529 IRIS_DEPTH_REG_MODE_D16_1X_MSAA,
1530 IRIS_DEPTH_REG_MODE_UNKNOWN,
1531 };
1532 #endif
1533
1534 /**
1535 * Generation-specific context state (ice->state.genx->...).
1536 *
1537 * Most state can go in iris_context directly, but these encode hardware
1538 * packets which vary by generation.
1539 */
1540 struct iris_genx_state {
1541 struct iris_vertex_buffer_state vertex_buffers[33];
1542 uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1543
1544 struct iris_depth_buffer_state depth_buffer;
1545
1546 uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1547
1548 #if GFX_VER == 8
1549 bool pma_fix_enabled;
1550 #endif
1551
1552 /* Is object level preemption enabled? */
1553 bool object_preemption;
1554
1555 #if INTEL_NEEDS_WA_1808121037
1556 enum iris_depth_reg_mode depth_reg_mode;
1557 #endif
1558
1559 struct {
1560 #if GFX_VER == 8
1561 struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1562 #endif
1563 } shaders[MESA_SHADER_STAGES];
1564 };
1565
1566 /**
1567 * The pipe->set_blend_color() driver hook.
1568 *
1569 * This corresponds to our COLOR_CALC_STATE.
1570 */
1571 static void
1572 iris_set_blend_color(struct pipe_context *ctx,
1573 const struct pipe_blend_color *state)
1574 {
1575 struct iris_context *ice = (struct iris_context *) ctx;
1576
1577 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1578 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1579 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1580 }
1581
1582 /**
1583 * Gallium CSO for blend state (see pipe_blend_state).
1584 */
1585 struct iris_blend_state {
1586 /** Partial 3DSTATE_PS_BLEND */
1587 uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1588
1589 /** Partial BLEND_STATE */
1590 uint32_t blend_state[GENX(BLEND_STATE_length) +
1591 IRIS_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1592
1593 bool alpha_to_coverage; /* for shader key */
1594
1595 /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1596 uint8_t blend_enables;
1597
1598 /** Bitfield of whether color writes are enabled for RT[i] */
1599 uint8_t color_write_enables;
1600
1601 /** Does RT[0] use dual color blending? */
1602 bool dual_color_blending;
1603
1604 int ps_dst_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1605 int ps_dst_alpha_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1606 };
1607
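/**
 * Alpha-to-one replaces the alpha values that reach the blender (including
 * the dual-source alpha) with 1.0, so blend factors that refer to SRC1_ALPHA
 * can be folded to constants up front: SRC1_ALPHA becomes ONE and
 * INV_SRC1_ALPHA becomes ZERO.
 */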
1608 static enum pipe_blendfactor
1609 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1610 {
1611 if (alpha_to_one) {
1612 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1613 return PIPE_BLENDFACTOR_ONE;
1614
1615 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1616 return PIPE_BLENDFACTOR_ZERO;
1617 }
1618
1619 return f;
1620 }
1621
1622 /**
1623 * The pipe->create_blend_state() driver hook.
1624 *
1625 * Translates a pipe_blend_state into iris_blend_state.
1626 */
1627 static void *
1628 iris_create_blend_state(struct pipe_context *ctx,
1629 const struct pipe_blend_state *state)
1630 {
1631 struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
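   /* cso->blend_state holds a BLEND_STATE header followed by one
    * BLEND_STATE_ENTRY per render target (see the array size in the struct
    * above); blend_entry walks the per-RT entries as we pack them below.
    */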
1632 uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1633
1634 cso->blend_enables = 0;
1635 cso->color_write_enables = 0;
1636 STATIC_ASSERT(IRIS_MAX_DRAW_BUFFERS <= 8);
1637
1638 cso->alpha_to_coverage = state->alpha_to_coverage;
1639
1640 bool indep_alpha_blend = false;
1641
1642 for (int i = 0; i < IRIS_MAX_DRAW_BUFFERS; i++) {
1643 const struct pipe_rt_blend_state *rt =
1644 &state->rt[state->independent_blend_enable ? i : 0];
1645
1646 enum pipe_blendfactor src_rgb =
1647 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1648 enum pipe_blendfactor src_alpha =
1649 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1650 enum pipe_blendfactor dst_rgb =
1651 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1652 enum pipe_blendfactor dst_alpha =
1653 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1654
1655 /* Stored separately in cso for dynamic emission. */
1656 cso->ps_dst_blend_factor[i] = (int) dst_rgb;
1657 cso->ps_dst_alpha_blend_factor[i] = (int) dst_alpha;
1658
1659 if (rt->rgb_func != rt->alpha_func ||
1660 src_rgb != src_alpha || dst_rgb != dst_alpha)
1661 indep_alpha_blend = true;
1662
1663 if (rt->blend_enable)
1664 cso->blend_enables |= 1u << i;
1665
1666 if (rt->colormask)
1667 cso->color_write_enables |= 1u << i;
1668
1669 iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1670 be.LogicOpEnable = state->logicop_enable;
1671 be.LogicOpFunction = state->logicop_func;
1672
1673 be.PreBlendSourceOnlyClampEnable = false;
1674 be.ColorClampRange = COLORCLAMP_RTFORMAT;
1675 be.PreBlendColorClampEnable = true;
1676 be.PostBlendColorClampEnable = true;
1677
1678 be.ColorBufferBlendEnable = rt->blend_enable;
1679
1680 be.ColorBlendFunction = rt->rgb_func;
1681 be.AlphaBlendFunction = rt->alpha_func;
1682
1683 /* The casts prevent warnings about implicit enum type conversions. */
1684 be.SourceBlendFactor = (int) src_rgb;
1685 be.SourceAlphaBlendFactor = (int) src_alpha;
1686
1687 be.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
1688 be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1689 be.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
1690 be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1691 }
1692 blend_entry += GENX(BLEND_STATE_ENTRY_length);
1693 }
1694
1695 iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1696 /* pb.HasWriteableRT is filled in at draw time.
1697 * pb.AlphaTestEnable is filled in at draw time.
1698 *
1699 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1700 * setting it when dual color blending without an appropriate shader.
1701 */
1702
1703 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1704 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1705
1706 /* The casts prevent warnings about implicit enum type conversions. */
1707 pb.SourceBlendFactor =
1708 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1709 pb.SourceAlphaBlendFactor =
1710 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1711 }
1712
1713 iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1714 bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1715 bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1716 bs.AlphaToOneEnable = state->alpha_to_one;
1717 bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage_dither;
1718 bs.ColorDitherEnable = state->dither;
1719       /* bs.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1720 }
1721
1722 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1723
1724 return cso;
1725 }
1726
1727 /**
1728 * The pipe->bind_blend_state() driver hook.
1729 *
1730 * Bind a blending CSO and flag related dirty bits.
1731 */
1732 static void
1733 iris_bind_blend_state(struct pipe_context *ctx, void *state)
1734 {
1735 struct iris_context *ice = (struct iris_context *) ctx;
1736 struct iris_blend_state *cso = state;
1737
1738 ice->state.cso_blend = cso;
1739
1740 ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1741 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1742 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1743
1744 if (GFX_VER == 8)
1745 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1746 }
1747
1748 /**
1749 * Return true if the FS writes to any color outputs which are not disabled
1750 * via color masking.
1751 */
1752 static bool
1753 has_writeable_rt(const struct iris_blend_state *cso_blend,
1754 const struct shader_info *fs_info)
1755 {
1756 if (!fs_info)
1757 return false;
1758
1759 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1760
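   /* Writing gl_FragColor (FRAG_RESULT_COLOR) broadcasts to every color
    * attachment, so treat all render targets as written in that case.
    */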
1761 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1762 rt_outputs = (1 << IRIS_MAX_DRAW_BUFFERS) - 1;
1763
1764 return cso_blend->color_write_enables & rt_outputs;
1765 }
1766
1767 /**
1768 * Gallium CSO for depth, stencil, and alpha testing state.
1769 */
1770 struct iris_depth_stencil_alpha_state {
1771 /** Partial 3DSTATE_WM_DEPTH_STENCIL. */
1772 uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
1773
1774 #if GFX_VER >= 12
1775 uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
1776 #endif
1777
1778 /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */
1779 unsigned alpha_enabled:1;
1780 unsigned alpha_func:3; /**< PIPE_FUNC_x */
1781 float alpha_ref_value; /**< reference value */
1782
1783 /** Outbound to resolve and cache set tracking. */
1784 bool depth_writes_enabled;
1785 bool stencil_writes_enabled;
1786
1787 /** Outbound to Gfx8-9 PMA stall equations */
1788 bool depth_test_enabled;
1789
1790 /** Tracking state of DS writes for Wa_18019816803. */
1791 bool ds_write_state;
1792 };
1793
1794 /**
1795 * The pipe->create_depth_stencil_alpha_state() driver hook.
1796 *
1797 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1798 * testing state since we need pieces of it in a variety of places.
1799 */
1800 static void *
1801 iris_create_zsa_state(struct pipe_context *ctx,
1802 const struct pipe_depth_stencil_alpha_state *state)
1803 {
1804 struct iris_depth_stencil_alpha_state *cso =
1805 malloc(sizeof(struct iris_depth_stencil_alpha_state));
1806
1807 bool two_sided_stencil = state->stencil[1].enabled;
1808
1809 bool depth_write_enabled = false;
1810 bool stencil_write_enabled = false;
1811
1812 /* Depth writes enabled? */
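   /* Writes can only land when the writemask is set and either the depth
    * test is disabled (the hardware then behaves as if the test always
    * passes) or the test can pass with a value different from what is
    * already stored: NEVER never passes, and with EQUAL a passing write
    * stores back the same value.
    */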
1813 if (state->depth_writemask &&
1814 ((!state->depth_enabled) ||
1815 ((state->depth_func != PIPE_FUNC_NEVER) &&
1816 (state->depth_func != PIPE_FUNC_EQUAL))))
1817 depth_write_enabled = true;
1818
1819 bool stencil_all_keep =
1820 state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1821 state->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1822 state->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1823 (!two_sided_stencil ||
1824 (state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1825 state->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1826 state->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP));
1827
1828 bool stencil_mask_zero =
1829 state->stencil[0].writemask == 0 ||
1830 (!two_sided_stencil || state->stencil[1].writemask == 0);
1831
1832 bool stencil_func_never =
1833 state->stencil[0].func == PIPE_FUNC_NEVER &&
1834 state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1835 (!two_sided_stencil ||
1836 (state->stencil[1].func == PIPE_FUNC_NEVER &&
1837 state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP));
1838
1839 /* Stencil writes enabled? */
1840 if (state->stencil[0].writemask != 0 ||
1841 ((two_sided_stencil && state->stencil[1].writemask != 0) &&
1842 (!stencil_all_keep &&
1843 !stencil_mask_zero &&
1844 !stencil_func_never)))
1845 stencil_write_enabled = true;
1846
1847 cso->ds_write_state = depth_write_enabled || stencil_write_enabled;
1848
1849 cso->alpha_enabled = state->alpha_enabled;
1850 cso->alpha_func = state->alpha_func;
1851 cso->alpha_ref_value = state->alpha_ref_value;
1852 cso->depth_writes_enabled = state->depth_writemask;
1853 cso->depth_test_enabled = state->depth_enabled;
1854 cso->stencil_writes_enabled =
1855 state->stencil[0].writemask != 0 ||
1856 (two_sided_stencil && state->stencil[1].writemask != 0);
1857
1858 /* gallium frontends need to optimize away EQUAL writes for us. */
1859 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1860
1861 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1862 wmds.StencilFailOp = state->stencil[0].fail_op;
1863 wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1864 wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1865 wmds.StencilTestFunction =
1866 translate_compare_func(state->stencil[0].func);
1867 wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1868 wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1869 wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1870 wmds.BackfaceStencilTestFunction =
1871 translate_compare_func(state->stencil[1].func);
1872 wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1873 wmds.DoubleSidedStencilEnable = two_sided_stencil;
1874 wmds.StencilTestEnable = state->stencil[0].enabled;
1875 wmds.StencilBufferWriteEnable =
1876 state->stencil[0].writemask != 0 ||
1877 (two_sided_stencil && state->stencil[1].writemask != 0);
1878 wmds.DepthTestEnable = state->depth_enabled;
1879 wmds.DepthBufferWriteEnable = state->depth_writemask;
1880 wmds.StencilTestMask = state->stencil[0].valuemask;
1881 wmds.StencilWriteMask = state->stencil[0].writemask;
1882 wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1883 wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1884 /* wmds.[Backface]StencilReferenceValue are merged later */
1885 #if GFX_VER >= 12
1886 wmds.StencilReferenceValueModifyDisable = true;
1887 #endif
1888 }
1889
1890 #if GFX_VER >= 12
1891 iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1892 depth_bounds.DepthBoundsTestValueModifyDisable = false;
1893 depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1894 depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1895 depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1896 depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1897 }
1898 #endif
1899
1900 return cso;
1901 }
1902
1903 /**
1904 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1905 *
1906 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1907 */
1908 static void
1909 iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1910 {
1911 struct iris_context *ice = (struct iris_context *) ctx;
1912 struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1913 struct iris_depth_stencil_alpha_state *new_cso = state;
1914
1915 if (new_cso) {
1916 if (cso_changed(alpha_ref_value))
1917 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1918
1919 if (cso_changed(alpha_enabled))
1920 ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
1921
1922 if (cso_changed(alpha_func))
1923 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1924
1925 if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
1926 ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1927
1928 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1929 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1930
1931       /* If ds_write_state changed, we need to flag IRIS_DIRTY_DS_WRITE_ENABLE. */
1932 if (!old_cso || (ice->state.ds_write_state != new_cso->ds_write_state)) {
1933 ice->state.dirty |= IRIS_DIRTY_DS_WRITE_ENABLE;
1934 ice->state.ds_write_state = new_cso->ds_write_state;
1935 }
1936
1937 #if GFX_VER >= 12
1938 if (cso_changed(depth_bounds))
1939 ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
1940 #endif
1941 }
1942
1943 ice->state.cso_zsa = new_cso;
1944 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
1945 ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
1946 ice->state.stage_dirty |=
1947 ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
1948
1949 if (GFX_VER == 8)
1950 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1951 }
1952
1953 #if GFX_VER == 8
1954 static bool
1955 want_pma_fix(struct iris_context *ice)
1956 {
1957 UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
1958 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
1959 const struct iris_fs_data *fs_data =
1960 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
1961 const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
1962 const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
1963 const struct iris_blend_state *cso_blend = ice->state.cso_blend;
1964
1965 /* In very specific combinations of state, we can instruct Gfx8-9 hardware
1966 * to avoid stalling at the pixel mask array. The state equations are
1967 * documented in these places:
1968 *
1969 * - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE
1970 * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
1971 *
1972 * Both equations share some common elements:
1973 *
1974 * no_hiz_op =
1975 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1976 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1977 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1978 * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
1979 *
1980 * killpixels =
1981 * 3DSTATE_WM::ForceKillPix != ForceOff &&
1982 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1983 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1984 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1985 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1986 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1987 *
1988 * (Technically the stencil PMA treats ForceKillPix differently,
1989 * but I think this is a documentation oversight, and we don't
1990 * ever use it in this way, so it doesn't matter).
1991 *
1992 * common_pma_fix =
1993 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
1994 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
1995 * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1996 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1997 * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
1998 * 3DSTATE_PS_EXTRA::PixelShaderValid &&
1999 * no_hiz_op
2000 *
2001 * These are always true:
2002 *
2003 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
2004 * 3DSTATE_PS_EXTRA::PixelShaderValid
2005 *
2006 * Also, we never use the normal drawing path for HiZ ops; these are true:
2007 *
2008 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2009 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2010 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2011 * 3DSTATE_WM_HZ_OP::StencilBufferClear)
2012 *
2013 * This happens sometimes:
2014 *
2015 * 3DSTATE_WM::ForceThreadDispatch != 1
2016 *
2017 * However, we choose to ignore it as it either agrees with the signal
2018 * (dispatch was already enabled, so nothing out of the ordinary), or
2019 * there are no framebuffer attachments (so no depth or HiZ anyway,
2020 * meaning the PMA signal will already be disabled).
2021 */
2022
2023 if (!cso_fb->zsbuf)
2024 return false;
2025
2026 struct iris_resource *zres, *sres;
2027 iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
2028
2029 /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2030 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2031 */
2032 if (!zres ||
2033 !iris_resource_level_has_hiz(devinfo, zres, cso_fb->zsbuf->u.tex.level))
2034 return false;
2035
2036 /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
2037 if (fs_data->early_fragment_tests)
2038 return false;
2039
2040 /* 3DSTATE_WM::ForceKillPix != ForceOff &&
2041 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2042 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2043 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2044 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
2045 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2046 */
2047 bool killpixels = fs_data->uses_kill || fs_data->uses_omask ||
2048 cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;
2049
2050 /* The Gfx8 depth PMA equation becomes:
2051 *
2052 * depth_writes =
2053 * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
2054 * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
2055 *
2056 * stencil_writes =
2057 * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
2058 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
2059 * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
2060 *
2061 * Z_PMA_OPT =
2062 * common_pma_fix &&
2063 * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
2064 * ((killpixels && (depth_writes || stencil_writes)) ||
2065 * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
2066 *
2067 */
2068 if (!cso_zsa->depth_test_enabled)
2069 return false;
2070
2071 return fs_data->computed_depth_mode != PSCDEPTH_OFF ||
2072 (killpixels && (cso_zsa->depth_writes_enabled ||
2073 (sres && cso_zsa->stencil_writes_enabled)));
2074 }
2075 #endif
2076
2077 void
2078 genX(update_pma_fix)(struct iris_context *ice,
2079 struct iris_batch *batch,
2080 bool enable)
2081 {
2082 #if GFX_VER == 8
2083 struct iris_genx_state *genx = ice->state.genx;
2084
2085 if (genx->pma_fix_enabled == enable)
2086 return;
2087
2088 genx->pma_fix_enabled = enable;
2089
2090 /* According to the Broadwell PIPE_CONTROL documentation, software should
2091 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2092    * prior to the LRI.  If stencil buffer writes are enabled, then a Render Cache Flush is also necessary.
2093 *
2094 * The Gfx9 docs say to use a depth stall rather than a command streamer
2095 * stall. However, the hardware seems to violently disagree. A full
2096 * command streamer stall seems to be needed in both cases.
2097 */
2098 iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2099 PIPE_CONTROL_CS_STALL |
2100 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2101 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2102
2103 iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
2104 reg.NPPMAFixEnable = enable;
2105 reg.NPEarlyZFailsDisable = enable;
2106 reg.NPPMAFixEnableMask = true;
2107 reg.NPEarlyZFailsDisableMask = true;
2108 }
2109
2110 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2111 * Flush bits is often necessary. We do it regardless because it's easier.
2112 * The render cache flush is also necessary if stencil writes are enabled.
2113 *
2114 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
2115 * flushes seem to work just as well.
2116 */
2117    iris_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
2118 PIPE_CONTROL_DEPTH_STALL |
2119 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2120 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2121 #endif
2122 }
2123
2124 /**
2125 * Gallium CSO for rasterizer state.
2126 */
2127 struct iris_rasterizer_state {
2128 uint32_t sf[GENX(3DSTATE_SF_length)];
2129 uint32_t clip[GENX(3DSTATE_CLIP_length)];
2130 uint32_t raster[GENX(3DSTATE_RASTER_length)];
2131 uint32_t wm[GENX(3DSTATE_WM_length)];
2132 uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
2133
2134 uint8_t num_clip_plane_consts;
2135 bool clip_halfz; /* for CC_VIEWPORT */
2136 bool depth_clip_near; /* for CC_VIEWPORT */
2137 bool depth_clip_far; /* for CC_VIEWPORT */
2138 bool flatshade; /* for shader state */
2139 bool flatshade_first; /* for stream output */
2140 bool clamp_fragment_color; /* for shader state */
2141 bool light_twoside; /* for shader state */
2142 bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
2143 bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
2144 bool line_smooth;
2145 bool line_stipple_enable;
2146 bool poly_stipple_enable;
2147 bool multisample;
2148 bool force_persample_interp;
2149 bool conservative_rasterization;
2150 bool fill_mode_point;
2151 bool fill_mode_line;
2152 bool fill_mode_point_or_line;
2153 enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
2154 uint16_t sprite_coord_enable;
2155 };
2156
2157 static float
2158 get_line_width(const struct pipe_rasterizer_state *state)
2159 {
2160 float line_width = state->line_width;
2161
2162 /* From the OpenGL 4.4 spec:
2163 *
2164 * "The actual width of non-antialiased lines is determined by rounding
2165 * the supplied width to the nearest integer, then clamping it to the
2166 * implementation-dependent maximum non-antialiased line width."
2167 */
2168 if (!state->multisample && !state->line_smooth)
2169 line_width = roundf(state->line_width);
2170
2171 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
2172 /* For 1 pixel line thickness or less, the general anti-aliasing
2173 * algorithm gives up, and a garbage line is generated. Setting a
2174 * Line Width of 0.0 specifies the rasterization of the "thinnest"
2175 * (one-pixel-wide), non-antialiased lines.
2176 *
2177 * Lines rendered with zero Line Width are rasterized using the
2178 * "Grid Intersection Quantization" rules as specified by the
2179 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
2180 */
2181 line_width = 0.0f;
2182 }
2183
2184 return line_width;
2185 }
2186
2187 /**
2188 * The pipe->create_rasterizer_state() driver hook.
2189 */
2190 static void *
2191 iris_create_rasterizer_state(struct pipe_context *ctx,
2192 const struct pipe_rasterizer_state *state)
2193 {
2194 struct iris_rasterizer_state *cso =
2195 malloc(sizeof(struct iris_rasterizer_state));
2196
2197 cso->multisample = state->multisample;
2198 cso->force_persample_interp = state->force_persample_interp;
2199 cso->clip_halfz = state->clip_halfz;
2200 cso->depth_clip_near = state->depth_clip_near;
2201 cso->depth_clip_far = state->depth_clip_far;
2202 cso->flatshade = state->flatshade;
2203 cso->flatshade_first = state->flatshade_first;
2204 cso->clamp_fragment_color = state->clamp_fragment_color;
2205 cso->light_twoside = state->light_twoside;
2206 cso->rasterizer_discard = state->rasterizer_discard;
2207 cso->half_pixel_center = state->half_pixel_center;
2208 cso->sprite_coord_mode = state->sprite_coord_mode;
2209 cso->sprite_coord_enable = state->sprite_coord_enable;
2210 cso->line_smooth = state->line_smooth;
2211 cso->line_stipple_enable = state->line_stipple_enable;
2212 cso->poly_stipple_enable = state->poly_stipple_enable;
2213 cso->conservative_rasterization =
2214 state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
2215
2216 cso->fill_mode_point =
2217 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2218 state->fill_back == PIPE_POLYGON_MODE_POINT;
2219 cso->fill_mode_line =
2220 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2221 state->fill_back == PIPE_POLYGON_MODE_LINE;
2222 cso->fill_mode_point_or_line =
2223 cso->fill_mode_point ||
2224 cso->fill_mode_line;
2225
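   /* util_logbase2() gives the index of the most significant enabled bit, so
    * this counts constants for planes 0 .. highest enabled plane, including
    * any disabled planes below it.
    */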
2226 if (state->clip_plane_enable != 0)
2227 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2228 else
2229 cso->num_clip_plane_consts = 0;
2230
2231 float line_width = get_line_width(state);
2232
2233 iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2234 sf.StatisticsEnable = true;
2235 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2236 sf.LineEndCapAntialiasingRegionWidth =
2237 state->line_smooth ? _10pixels : _05pixels;
2238 sf.LastPixelEnable = state->line_last_pixel;
2239 sf.LineWidth = line_width;
2240 sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
2241 !state->point_quad_rasterization;
2242 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2243 sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
2244
2245 if (state->flatshade_first) {
2246 sf.TriangleFanProvokingVertexSelect = 1;
2247 } else {
2248 sf.TriangleStripListProvokingVertexSelect = 2;
2249 sf.TriangleFanProvokingVertexSelect = 2;
2250 sf.LineStripListProvokingVertexSelect = 1;
2251 }
2252 }
2253
2254 iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2255 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2256 rr.CullMode = translate_cull_mode(state->cull_face);
2257 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2258 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2259 rr.DXMultisampleRasterizationEnable = state->multisample;
2260 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2261 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2262 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2263 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2264 rr.GlobalDepthOffsetScale = state->offset_scale;
2265 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2266 rr.SmoothPointEnable = state->point_smooth;
2267 rr.ScissorRectangleEnable = state->scissor;
2268 #if GFX_VER >= 9
2269 rr.ViewportZNearClipTestEnable = state->depth_clip_near;
2270 rr.ViewportZFarClipTestEnable = state->depth_clip_far;
2271 rr.ConservativeRasterizationEnable =
2272 cso->conservative_rasterization;
2273 #else
2274 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2275 #endif
2276 }
2277
2278 iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2279 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2280 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2281 */
2282 cl.EarlyCullEnable = true;
2283 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2284 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2285 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2286 cl.GuardbandClipTestEnable = true;
2287 cl.ClipEnable = true;
2288 cl.MinimumPointWidth = 0.125;
2289 cl.MaximumPointWidth = 255.875;
2290
2291 if (state->flatshade_first) {
2292 cl.TriangleFanProvokingVertexSelect = 1;
2293 } else {
2294 cl.TriangleStripListProvokingVertexSelect = 2;
2295 cl.TriangleFanProvokingVertexSelect = 2;
2296 cl.LineStripListProvokingVertexSelect = 1;
2297 }
2298 }
2299
2300 iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
2301 /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
2302 * filled in at draw time from the FS program.
2303 */
2304 wm.LineAntialiasingRegionWidth = _10pixels;
2305 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2306 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2307 wm.LineStippleEnable = state->line_stipple_enable;
2308 wm.PolygonStippleEnable = state->poly_stipple_enable;
2309 }
2310
2311 /* Remap from 0..255 back to 1..256 */
2312 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2313
2314 iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2315 if (state->line_stipple_enable) {
2316 line.LineStipplePattern = state->line_stipple_pattern;
2317 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2318 line.LineStippleRepeatCount = line_stipple_factor;
2319 }
2320 }
2321
2322 return cso;
2323 }
2324
2325 /**
2326 * The pipe->bind_rasterizer_state() driver hook.
2327 *
2328 * Bind a rasterizer CSO and flag related dirty bits.
2329 */
2330 static void
2331 iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2332 {
2333 struct iris_context *ice = (struct iris_context *) ctx;
2334 struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
2335 struct iris_rasterizer_state *new_cso = state;
2336
2337 if (new_cso) {
2338 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2339 if (cso_changed_memcmp(line_stipple))
2340 ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
2341
2342 if (cso_changed(half_pixel_center))
2343 ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
2344
2345 if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
2346 ice->state.dirty |= IRIS_DIRTY_WM;
2347
2348 if (cso_changed(rasterizer_discard))
2349 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
2350
2351 if (cso_changed(flatshade_first))
2352 ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
2353
2354 if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
2355 cso_changed(clip_halfz))
2356 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2357
2358 if (cso_changed(sprite_coord_enable) ||
2359 cso_changed(sprite_coord_mode) ||
2360 cso_changed(light_twoside))
2361 ice->state.dirty |= IRIS_DIRTY_SBE;
2362
2363 if (cso_changed(conservative_rasterization))
2364 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
2365 }
2366
2367 ice->state.cso_rast = new_cso;
2368 ice->state.dirty |= IRIS_DIRTY_RASTER;
2369 ice->state.dirty |= IRIS_DIRTY_CLIP;
2370 ice->state.stage_dirty |=
2371 ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
2372 }
2373
2374 /**
2375 * Return true if the given wrap mode requires the border color to exist.
2376 *
2377 * (We can skip uploading it if the sampler isn't going to use it.)
2378 */
2379 static bool
2380 wrap_mode_needs_border_color(unsigned wrap_mode)
2381 {
2382 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2383 }
2384
2385 /**
2386 * Gallium CSO for sampler state.
2387 */
2388 struct iris_sampler_state {
2389 union pipe_color_union border_color;
2390 bool needs_border_color;
2391
2392 uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
2393
2394 #if GFX_VERx10 == 125
2395 /* Sampler state structure to use for 3D textures in order to
2396 * implement Wa_14014414195.
2397 */
2398 uint32_t sampler_state_3d[GENX(SAMPLER_STATE_length)];
2399 #endif
2400 };
2401
2402 static void
2403 fill_sampler_state(uint32_t *sampler_state,
2404 const struct pipe_sampler_state *state,
2405 unsigned max_anisotropy)
2406 {
2407 float min_lod = state->min_lod;
2408 unsigned mag_img_filter = state->mag_img_filter;
2409
2410 // XXX: explain this code ported from ilo...I don't get it at all...
2411 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2412 state->min_lod > 0.0f) {
2413 min_lod = 0.0f;
2414 mag_img_filter = state->min_img_filter;
2415 }
2416
2417 iris_pack_state(GENX(SAMPLER_STATE), sampler_state, samp) {
2418 samp.TCXAddressControlMode = translate_wrap(state->wrap_s);
2419 samp.TCYAddressControlMode = translate_wrap(state->wrap_t);
2420 samp.TCZAddressControlMode = translate_wrap(state->wrap_r);
2421 samp.CubeSurfaceControlMode = state->seamless_cube_map;
2422 samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2423 samp.MinModeFilter = state->min_img_filter;
2424 samp.MagModeFilter = mag_img_filter;
2425 samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2426 samp.MaximumAnisotropy = RATIO21;
2427
2428 if (max_anisotropy >= 2) {
2429 if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2430 samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2431 samp.AnisotropicAlgorithm = EWAApproximation;
2432 }
2433
2434 if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
2435 samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2436
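         /* The hardware field counts in steps of 2:1, from RATIO21 (2:1) up
          * to RATIO161 (16:1), so an API value of N maps to (N - 2) / 2.
          */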
2437 samp.MaximumAnisotropy =
2438 MIN2((max_anisotropy - 2) / 2, RATIO161);
2439 }
2440
2441 /* Set address rounding bits if not using nearest filtering. */
2442 if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2443 samp.UAddressMinFilterRoundingEnable = true;
2444 samp.VAddressMinFilterRoundingEnable = true;
2445 samp.RAddressMinFilterRoundingEnable = true;
2446 }
2447
2448 if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2449 samp.UAddressMagFilterRoundingEnable = true;
2450 samp.VAddressMagFilterRoundingEnable = true;
2451 samp.RAddressMagFilterRoundingEnable = true;
2452 }
2453
2454 if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2455 samp.ShadowFunction = translate_shadow_func(state->compare_func);
2456
2457 const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2458
2459 samp.LODPreClampMode = CLAMP_MODE_OGL;
2460 samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2461 samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2462 samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2463
2464 /* .BorderColorPointer is filled in by iris_bind_sampler_states. */
2465 }
2466 }
2467
2468 /**
2469 * The pipe->create_sampler_state() driver hook.
2470 *
2471 * We fill out SAMPLER_STATE (except for the border color pointer), and
2472 * store that on the CPU. It doesn't make sense to upload it to a GPU
2473 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2474  * all bound sampler states to be in contiguous memory.
2475 */
2476 static void *
2477 iris_create_sampler_state(struct pipe_context *ctx,
2478 const struct pipe_sampler_state *state)
2479 {
2480 UNUSED struct iris_screen *screen = (void *)ctx->screen;
2481 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2482 struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
2483
2484 if (!cso)
2485 return NULL;
2486
2487 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2488 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2489
2490 unsigned wrap_s = translate_wrap(state->wrap_s);
2491 unsigned wrap_t = translate_wrap(state->wrap_t);
2492 unsigned wrap_r = translate_wrap(state->wrap_r);
2493
2494 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2495
2496 cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2497 wrap_mode_needs_border_color(wrap_t) ||
2498 wrap_mode_needs_border_color(wrap_r);
2499
2500 fill_sampler_state(cso->sampler_state, state, state->max_anisotropy);
2501
2502 #if GFX_VERx10 == 125
2503 /* Fill an extra sampler state structure with anisotropic filtering
2504     * disabled, which is used to implement Wa_14014414195.
2505 */
2506 if (intel_needs_workaround(screen->devinfo, 14014414195))
2507 fill_sampler_state(cso->sampler_state_3d, state, 0);
2508 #endif
2509
2510 return cso;
2511 }
2512
2513 /**
2514 * The pipe->bind_sampler_states() driver hook.
2515 */
2516 static void
2517 iris_bind_sampler_states(struct pipe_context *ctx,
2518 enum pipe_shader_type p_stage,
2519 unsigned start, unsigned count,
2520 void **states)
2521 {
2522 struct iris_context *ice = (struct iris_context *) ctx;
2523 gl_shader_stage stage = stage_from_pipe(p_stage);
2524 struct iris_shader_state *shs = &ice->state.shaders[stage];
2525
2526 assert(start + count <= IRIS_MAX_SAMPLERS);
2527
2528 bool dirty = false;
2529
2530 for (int i = 0; i < count; i++) {
2531 struct iris_sampler_state *state = states ? states[i] : NULL;
2532 if (shs->samplers[start + i] != state) {
2533 shs->samplers[start + i] = state;
2534 dirty = true;
2535 }
2536 }
2537
2538 if (dirty)
2539 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2540 }
2541
2542 /**
2543 * Upload the sampler states into a contiguous area of GPU memory, for
2544  * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2545 *
2546 * Also fill out the border color state pointers.
2547 */
2548 static void
2549 iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
2550 {
2551 struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen;
2552 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
2553 struct iris_shader_state *shs = &ice->state.shaders[stage];
2554 struct iris_border_color_pool *border_color_pool =
2555 iris_bufmgr_get_border_color_pool(screen->bufmgr);
2556
2557 /* We assume gallium frontends will call pipe->bind_sampler_states()
2558 * if the program's number of textures changes.
2559 */
2560 unsigned count = util_last_bit64(shader->bt.samplers_used_mask);
2561
2562 if (!count)
2563 return;
2564
2565 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2566 * in the dynamic state memory zone, so we can point to it via the
2567 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2568 */
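   /* GENX(SAMPLER_STATE_length) is measured in DWords, so each entry in the
    * table occupies 4 * length bytes.
    */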
2569 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2570 uint32_t *map =
2571 upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
2572 if (unlikely(!map))
2573 return;
2574
2575 struct pipe_resource *res = shs->sampler_table.res;
2576 struct iris_bo *bo = iris_resource_bo(res);
2577
2578 iris_record_state_size(ice->state.sizes,
2579 bo->address + shs->sampler_table.offset, size);
2580
2581 shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);
2582
2583 ice->state.need_border_colors &= ~(1 << stage);
2584
2585 for (int i = 0; i < count; i++) {
2586 struct iris_sampler_state *state = shs->samplers[i];
2587 struct iris_sampler_view *tex = shs->textures[i];
2588
2589 if (!state) {
2590 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2591 } else {
2592 const uint32_t *sampler_state = state->sampler_state;
2593
2594 #if GFX_VERx10 == 125
2595 if (intel_needs_workaround(screen->devinfo, 14014414195) &&
2596 tex && tex->res->base.b.target == PIPE_TEXTURE_3D) {
2597 sampler_state = state->sampler_state_3d;
2598 }
2599 #endif
2600
2601 if (!state->needs_border_color) {
2602 memcpy(map, sampler_state, 4 * GENX(SAMPLER_STATE_length));
2603 } else {
2604 ice->state.need_border_colors |= 1 << stage;
2605
2606 /* We may need to swizzle the border color for format faking.
2607 * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2608 * This means we need to move the border color's A channel into
2609 * the R or G channels so that those read swizzles will move it
2610 * back into A.
2611 */
2612 union pipe_color_union *color = &state->border_color;
2613 union pipe_color_union tmp;
2614 if (tex) {
2615 enum pipe_format internal_format = tex->res->internal_format;
2616
2617 if (util_format_is_alpha(internal_format)) {
2618 unsigned char swz[4] = {
2619 PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
2620 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2621 };
2622 util_format_apply_color_swizzle(&tmp, color, swz, true);
2623 color = &tmp;
2624 } else if (util_format_is_luminance_alpha(internal_format) &&
2625 internal_format != PIPE_FORMAT_L8A8_SRGB) {
2626 unsigned char swz[4] = {
2627 PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
2628 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2629 };
2630 util_format_apply_color_swizzle(&tmp, color, swz, true);
2631 color = &tmp;
2632 }
2633 }
2634
2635 /* Stream out the border color and merge the pointer. */
2636 uint32_t offset = iris_upload_border_color(border_color_pool,
2637 color);
2638
2639 uint32_t dynamic[GENX(SAMPLER_STATE_length)];
2640 iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
2641 dyns.BorderColorPointer = offset;
2642 }
2643
2644 for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
2645 map[j] = sampler_state[j] | dynamic[j];
2646 }
2647 }
2648
2649 map += GENX(SAMPLER_STATE_length);
2650 }
2651 }
2652
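/**
 * Resolve an API swizzle selector through the format's own channel swizzle,
 * so that views of formats faked with a different hardware layout (e.g.
 * alpha or luminance-alpha stored as R/RG, as noted in the border color
 * handling above) read from the channels that actually hold the data.
 */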
2653 static enum isl_channel_select
2654 fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2655 {
2656 switch (swz) {
2657 case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2658 case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2659 case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2660 case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2661 case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2662 case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2663 default: unreachable("invalid swizzle");
2664 }
2665 }
2666
2667 static void
2668 fill_buffer_surface_state(struct isl_device *isl_dev,
2669 struct iris_resource *res,
2670 void *map,
2671 enum isl_format format,
2672 struct isl_swizzle swizzle,
2673 unsigned offset,
2674 unsigned size,
2675 isl_surf_usage_flags_t usage)
2676 {
2677 const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2678 const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2679
2680 /* The ARB_texture_buffer_specification says:
2681 *
2682 * "The number of texels in the buffer texture's texel array is given by
2683 *
2684 * floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2685 *
2686 * where <buffer_size> is the size of the buffer object, in basic
2687 * machine units and <components> and <base_type> are the element count
2688 * and base data type for elements, as specified in Table X.1. The
2689 * number of texels in the texel array is then clamped to the
2690 * implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2691 *
2692 * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2693 * so that when ISL divides by stride to obtain the number of texels, that
2694 * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2695 */
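   /* For example, viewing a very large buffer as RGBA32_FLOAT (cpp == 16)
    * clamps final_size to IRIS_MAX_TEXTURE_BUFFER_SIZE * 16 bytes below, so
    * the texel count ISL derives (final_size / 16) lands exactly at the
    * texel limit rather than exceeding it.
    */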
2696 unsigned final_size =
2697 MIN3(size, res->bo->size - res->offset - offset,
2698 IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
2699
2700 isl_buffer_fill_state(isl_dev, map,
2701 .address = res->bo->address + res->offset + offset,
2702 .size_B = final_size,
2703 .format = format,
2704 .swizzle = swizzle,
2705 .stride_B = cpp,
2706 .mocs = iris_mocs(res->bo, isl_dev, usage));
2707 }
2708
2709 #define SURFACE_STATE_ALIGNMENT 64
2710
2711 /**
2712 * Allocate several contiguous SURFACE_STATE structures, one for each
2713 * supported auxiliary surface mode. This only allocates the CPU-side
2714 * copy, they will need to be uploaded later after they're filled in.
2715 */
2716 static void
2717 alloc_surface_states(struct iris_surface_state *surf_state,
2718 unsigned aux_usages)
2719 {
2720 enum { surf_size = 4 * GENX(RENDER_SURFACE_STATE_length) };
2721
2722 /* If this changes, update this to explicitly align pointers */
2723 STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2724
2725 assert(aux_usages != 0);
2726
2727 /* In case we're re-allocating them... */
2728 free(surf_state->cpu);
2729
2730 surf_state->aux_usages = aux_usages;
2731 surf_state->num_states = util_bitcount(aux_usages);
2732 surf_state->cpu = calloc(surf_state->num_states, surf_size);
2733 surf_state->ref.offset = 0;
2734 pipe_resource_reference(&surf_state->ref.res, NULL);
2735
2736 assert(surf_state->cpu);
2737 }
2738
2739 /**
2740 * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2741 */
2742 static void
2743 upload_surface_states(struct u_upload_mgr *mgr,
2744 struct iris_surface_state *surf_state)
2745 {
2746 const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2747 const unsigned bytes = surf_state->num_states * surf_size;
2748
2749 void *map =
2750 upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2751
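   /* upload_state() returns an offset within the BO; binding table entries
    * and surface state pointers are expressed relative to Surface State Base
    * Address, so fold the BO's offset from that base into ref.offset.
    */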
2752 surf_state->ref.offset +=
2753 iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2754
2755 if (map)
2756 memcpy(map, surf_state->cpu, bytes);
2757 }
2758
2759 /**
2760 * Update resource addresses in a set of SURFACE_STATE descriptors,
2761 * and re-upload them if necessary.
2762 */
2763 static bool
2764 update_surface_state_addrs(struct u_upload_mgr *mgr,
2765 struct iris_surface_state *surf_state,
2766 struct iris_bo *bo)
2767 {
2768 if (surf_state->bo_address == bo->address)
2769 return false;
2770
2771 STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2772 STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2773
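   /* SurfaceBaseAddress_start is a bit offset into RENDER_SURFACE_STATE;
    * dividing by 32 turns it into an index into the uint32_t CPU copy. The
    * static asserts above guarantee it is an aligned 64-bit field that we
    * can rewrite in place.
    */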
2774 uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2775
2776 /* First, update the CPU copies. We assume no other fields exist in
2777 * the QWord containing Surface Base Address.
2778 */
2779 for (unsigned i = 0; i < surf_state->num_states; i++) {
2780 *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2781 ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2782 }
2783
2784 /* Next, upload the updated copies to a GPU buffer. */
2785 upload_surface_states(mgr, surf_state);
2786
2787 surf_state->bo_address = bo->address;
2788
2789 return true;
2790 }
2791
2792 /* This function should only be used when we need to fill out the surf
2793  * using information provided by a pipe_(image|sampler)_view. That is only
2794  * necessary for the CL extension cl_khr_image2d_from_buffer, which is
2795  * also why ISL_SURF_DIM_2D is hardcoded in the dim field.
2796 */
2797 static void
2798 fill_surf_for_tex2d_from_buffer(struct isl_device *isl_dev,
2799 enum isl_format format,
2800 unsigned width,
2801 unsigned height,
2802 unsigned row_stride,
2803 isl_surf_usage_flags_t usage,
2804 struct isl_surf *surf)
2805 {
2806 const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2807 const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2808
2809 const struct isl_surf_init_info init_info = {
2810 .dim = ISL_SURF_DIM_2D,
2811 .format = format,
2812 .width = width,
2813 .height = height,
2814 .depth = 1,
2815 .levels = 1,
2816 .array_len = 1,
2817 .samples = 1,
2818 .min_alignment_B = 4,
2819 .row_pitch_B = row_stride * cpp,
2820 .usage = usage,
2821 .tiling_flags = ISL_TILING_LINEAR_BIT,
2822 };
2823
2824 const bool isl_surf_created_successfully =
2825 isl_surf_init_s(isl_dev, surf, &init_info);
2826
2827 assert(isl_surf_created_successfully);
2828 }
2829
2830 static void
2831 fill_surface_state(struct isl_device *isl_dev,
2832 void *map,
2833 struct iris_resource *res,
2834 struct isl_surf *surf,
2835 struct isl_view *view,
2836 unsigned aux_usage,
2837 uint32_t extra_main_offset,
2838 uint32_t tile_x_sa,
2839 uint32_t tile_y_sa)
2840 {
2841 struct isl_surf_fill_state_info f = {
2842 .surf = surf,
2843 .view = view,
2844 .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2845 .address = res->bo->address + res->offset + extra_main_offset,
2846 .x_offset_sa = tile_x_sa,
2847 .y_offset_sa = tile_y_sa,
2848 };
2849
2850 if (aux_usage != ISL_AUX_USAGE_NONE) {
2851 f.aux_surf = &res->aux.surf;
2852 f.aux_usage = aux_usage;
2853 f.clear_color = res->aux.clear_color;
2854
2855 if (aux_usage == ISL_AUX_USAGE_MC)
2856 f.mc_format = iris_format_for_usage(isl_dev->info,
2857 res->external_format,
2858 surf->usage).fmt;
2859
2860 if (res->aux.bo)
2861 f.aux_address = res->aux.bo->address + res->aux.offset;
2862
2863 if (res->aux.clear_color_bo) {
2864 f.clear_address = res->aux.clear_color_bo->address +
2865 res->aux.clear_color_offset;
2866 f.use_clear_address = isl_dev->info->ver > 9;
2867 }
2868 }
2869
2870 isl_surf_fill_state_s(isl_dev, map, &f);
2871 }
2872
2873 static void
2874 fill_surface_states(struct isl_device *isl_dev,
2875 struct iris_surface_state *surf_state,
2876 struct iris_resource *res,
2877 struct isl_surf *surf,
2878 struct isl_view *view,
2879 uint64_t extra_main_offset,
2880 uint32_t tile_x_sa,
2881 uint32_t tile_y_sa)
2882 {
2883 void *map = surf_state->cpu;
2884 unsigned aux_modes = surf_state->aux_usages;
2885
2886 while (aux_modes) {
2887 enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2888
2889 fill_surface_state(isl_dev, map, res, surf, view, aux_usage,
2890 extra_main_offset, tile_x_sa, tile_y_sa);
2891
2892 map += SURFACE_STATE_ALIGNMENT;
2893 }
2894 }
2895
2896 /**
2897 * The pipe->create_sampler_view() driver hook.
2898 */
2899 static struct pipe_sampler_view *
2900 iris_create_sampler_view(struct pipe_context *ctx,
2901 struct pipe_resource *tex,
2902 const struct pipe_sampler_view *tmpl)
2903 {
2904 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2905 const struct intel_device_info *devinfo = screen->devinfo;
2906 struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
2907
2908 if (!isv)
2909 return NULL;
2910
2911 /* initialize base object */
2912 isv->base = *tmpl;
2913 isv->base.context = ctx;
2914 isv->base.texture = NULL;
2915 pipe_reference_init(&isv->base.reference, 1);
2916 pipe_resource_reference(&isv->base.texture, tex);
2917
2918 if (util_format_is_depth_or_stencil(tmpl->format)) {
2919 struct iris_resource *zres, *sres;
2920 const struct util_format_description *desc =
2921 util_format_description(tmpl->format);
2922
2923 iris_get_depth_stencil_resources(tex, &zres, &sres);
2924
2925 tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2926 }
2927
2928 isv->res = (struct iris_resource *) tex;
2929
2930 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2931
2932 if (isv->base.target == PIPE_TEXTURE_CUBE ||
2933 isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2934 usage |= ISL_SURF_USAGE_CUBE_BIT;
2935
2936 const struct iris_format_info fmt =
2937 iris_format_for_usage(devinfo, tmpl->format, usage);
2938
2939 isv->clear_color = isv->res->aux.clear_color;
2940
2941 isv->view = (struct isl_view) {
2942 .format = fmt.fmt,
2943 .swizzle = (struct isl_swizzle) {
2944 .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
2945 .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
2946 .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
2947 .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
2948 },
2949 .usage = usage,
2950 };
2951
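   /* Decide which aux usage variants of SURFACE_STATE this view needs: if
    * the view format can't be sampled with CCS, or we can't sample using the
    * depth aux surface, only a plain AUX_NONE state is required; otherwise
    * we need AUX_NONE plus one state for the resource's aux usage.
    */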
2952 unsigned aux_usages = 0;
2953
2954 if ((isv->res->aux.usage == ISL_AUX_USAGE_CCS_D ||
2955 isv->res->aux.usage == ISL_AUX_USAGE_CCS_E ||
2956 isv->res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
2957 !isl_format_supports_ccs_e(devinfo, isv->view.format)) {
2958 aux_usages = 1 << ISL_AUX_USAGE_NONE;
2959 } else if (isl_aux_usage_has_hiz(isv->res->aux.usage) &&
2960 !iris_sample_with_depth_aux(devinfo, isv->res)) {
2961 aux_usages = 1 << ISL_AUX_USAGE_NONE;
2962 } else {
2963 aux_usages = 1 << ISL_AUX_USAGE_NONE |
2964 1 << isv->res->aux.usage;
2965 }
2966
2967 alloc_surface_states(&isv->surface_state, aux_usages);
2968 isv->surface_state.bo_address = isv->res->bo->address;
2969
2970 /* Fill out SURFACE_STATE for this view. */
2971 if (tmpl->target != PIPE_BUFFER) {
2972 isv->view.base_level = tmpl->u.tex.first_level;
2973 isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2974
2975 if (tmpl->target == PIPE_TEXTURE_3D) {
2976 isv->view.base_array_layer = 0;
2977 isv->view.array_len = 1;
2978 } else {
2979 #if GFX_VER < 9
2980 /* Hardware older than skylake ignores this value */
2981 assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
2982 #endif
2983 isv->view.base_array_layer = tmpl->u.tex.first_layer;
2984 isv->view.array_len =
2985 tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2986 }
2987
2988 fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
2989 &isv->res->surf, &isv->view, 0, 0, 0);
2990 } else if (isv->base.is_tex2d_from_buf) {
2991       /* This is a 2D image created from a buffer, so fill the surface
2992        * state from the image parameters provided by the CL application
2993        * instead of the resource's own surface layout.
2994 */
2995 isv->view.base_array_layer = 0;
2996 isv->view.array_len = 1;
2997
2998 /* Create temp_surf and fill with values provided by CL application */
2999 struct isl_surf temp_surf;
3000 fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt.fmt,
3001 isv->base.u.tex2d_from_buf.width,
3002 isv->base.u.tex2d_from_buf.height,
3003 isv->base.u.tex2d_from_buf.row_stride,
3004 usage,
3005 &temp_surf);
3006
3007 fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3008 &temp_surf, &isv->view, 0, 0, 0);
3009 } else {
3010 fill_buffer_surface_state(&screen->isl_dev, isv->res,
3011 isv->surface_state.cpu,
3012 isv->view.format, isv->view.swizzle,
3013 tmpl->u.buf.offset, tmpl->u.buf.size,
3014 ISL_SURF_USAGE_TEXTURE_BIT);
3015 }
3016
3017 return &isv->base;
3018 }
3019
3020 static void
3021 iris_sampler_view_destroy(struct pipe_context *ctx,
3022 struct pipe_sampler_view *state)
3023 {
3024 struct iris_sampler_view *isv = (void *) state;
3025 pipe_resource_reference(&state->texture, NULL);
3026 pipe_resource_reference(&isv->surface_state.ref.res, NULL);
3027 free(isv->surface_state.cpu);
3028 free(isv);
3029 }
3030
3031 /**
3032 * The pipe->create_surface() driver hook.
3033 *
3034 * In Gallium nomenclature, "surfaces" are a view of a resource that
3035 * can be bound as a render target or depth/stencil buffer.
3036 */
3037 static struct pipe_surface *
3038 iris_create_surface(struct pipe_context *ctx,
3039 struct pipe_resource *tex,
3040 const struct pipe_surface *tmpl)
3041 {
3042 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3043 const struct intel_device_info *devinfo = screen->devinfo;
3044
3045 isl_surf_usage_flags_t usage = 0;
3046 if (tmpl->writable)
3047 usage = ISL_SURF_USAGE_STORAGE_BIT;
3048 else if (util_format_is_depth_or_stencil(tmpl->format))
3049 usage = ISL_SURF_USAGE_DEPTH_BIT;
3050 else
3051 usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
3052
3053 const struct iris_format_info fmt =
3054 iris_format_for_usage(devinfo, tmpl->format, usage);
3055
3056 if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
3057 !isl_format_supports_rendering(devinfo, fmt.fmt)) {
3058 /* Framebuffer validation will reject this invalid case, but it
3059 * hasn't had the opportunity yet. In the meantime, we need to
3060 * avoid hitting ISL asserts about unsupported formats below.
3061 */
3062 return NULL;
3063 }
3064
3065 struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
3066 struct iris_resource *res = (struct iris_resource *) tex;
3067
3068 if (!surf)
3069 return NULL;
3070
3071 uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3072
3073 struct isl_view *view = &surf->view;
3074 *view = (struct isl_view) {
3075 .format = fmt.fmt,
3076 .base_level = tmpl->u.tex.level,
3077 .levels = 1,
3078 .base_array_layer = tmpl->u.tex.first_layer,
3079 .array_len = array_len,
3080 .swizzle = ISL_SWIZZLE_IDENTITY,
3081 .usage = usage,
3082 };
3083
3084 #if GFX_VER == 8
3085 struct isl_view *read_view = &surf->read_view;
3086 *read_view = (struct isl_view) {
3087 .format = fmt.fmt,
3088 .base_level = tmpl->u.tex.level,
3089 .levels = 1,
3090 .base_array_layer = tmpl->u.tex.first_layer,
3091 .array_len = array_len,
3092 .swizzle = ISL_SWIZZLE_IDENTITY,
3093 .usage = ISL_SURF_USAGE_TEXTURE_BIT,
3094 };
3095
3096 struct isl_surf read_surf = res->surf;
3097 uint64_t read_surf_offset_B = 0;
3098 uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
3099 if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
3100 /* The minimum array element field of the surface state structure is
3101 * ignored by the sampler unit for 3D textures on some hardware. If the
3102 * render buffer is a single slice of a 3D texture, create a 2D texture
3103 * covering that slice.
3104 *
3105 * TODO: This only handles the case where we're rendering to a single
3106 * slice of an array texture. If we have layered rendering combined
3107 * with non-coherent FB fetch and a non-zero base_array_layer, then
3108 * we're going to run into problems.
3109 *
3110 * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
3111 */
3112 isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
3113 read_view->base_level,
3114 0, read_view->base_array_layer,
3115 &read_surf, &read_surf_offset_B,
3116 &read_surf_tile_x_sa, &read_surf_tile_y_sa);
3117 read_view->base_level = 0;
3118 read_view->base_array_layer = 0;
3119 assert(read_view->array_len == 1);
3120 } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
3121 /* Convert 1D array textures to 2D arrays because shaders always provide
3122 * the array index coordinate at the Z component to avoid recompiles
3123 * when changing the texture target of the framebuffer.
3124 */
3125 assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
3126 read_surf.dim = ISL_SURF_DIM_2D;
3127 }
3128 #endif
3129
3130 struct isl_surf isl_surf = res->surf;
3131 uint64_t offset_B = 0;
3132 uint32_t tile_x_el = 0, tile_y_el = 0;
3133 if (isl_format_is_compressed(res->surf.format)) {
3134 /* The resource has a compressed format, which is not renderable, but we
3135 * have a renderable view format. We must be attempting to upload
3136 * blocks of compressed data via an uncompressed view.
3137 *
3138 * In this case, we can assume there are no auxiliary surfaces, a single
3139 * miplevel, and that the resource is single-sampled. Gallium may try
3140 * and create an uncompressed view with multiple layers, however.
3141 */
3142 assert(res->aux.surf.size_B == 0);
3143 assert(res->surf.samples == 1);
3144 assert(view->levels == 1);
3145
3146 bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev,
3147 &res->surf, view,
3148 &isl_surf, view, &offset_B,
3149 &tile_x_el, &tile_y_el);
3150
3151 /* On Broadwell, HALIGN and VALIGN are specified in pixels and are
3152 * hard-coded to align to exactly the block size of the compressed
3153 * texture. This means that, when reinterpreted as a non-compressed
3154 * texture, the tile offsets may be anything.
3155 *
3156 * We need them to be multiples of 4 to be usable in RENDER_SURFACE_STATE,
3157 * so force the state tracker to take fallback paths if they're not.
3158 */
3159 #if GFX_VER == 8
3160 if (tile_x_el % 4 != 0 || tile_y_el % 4 != 0) {
3161 ok = false;
3162 }
3163 #endif
3164
3165 if (!ok) {
3166 free(surf);
3167 return NULL;
3168 }
3169 }
3170
3171 surf->clear_color = res->aux.clear_color;
3172
3173 struct pipe_surface *psurf = &surf->base;
3174 pipe_reference_init(&psurf->reference, 1);
3175 pipe_resource_reference(&psurf->texture, tex);
3176 psurf->context = ctx;
3177 psurf->format = tmpl->format;
3178 psurf->width = isl_surf.logical_level0_px.width;
3179 psurf->height = isl_surf.logical_level0_px.height;
3180 psurf->texture = tex;
3181 psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
3182 psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
3183 psurf->u.tex.level = tmpl->u.tex.level;
3184
3185 /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
3186 if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
3187 ISL_SURF_USAGE_STENCIL_BIT))
3188 return psurf;
3189
3190 /* Fill out a SURFACE_STATE for each possible auxiliary surface mode and
3191 * return the pipe_surface.
3192 */
3193 unsigned aux_usages = 0;
3194
3195 if ((res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3196 res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3197 !isl_format_supports_ccs_e(devinfo, view->format)) {
3198 aux_usages = 1 << ISL_AUX_USAGE_NONE;
3199 } else {
3200 aux_usages = 1 << ISL_AUX_USAGE_NONE |
3201 1 << res->aux.usage;
3202 }
3203
3204 alloc_surface_states(&surf->surface_state, aux_usages);
3205 surf->surface_state.bo_address = res->bo->address;
3206 fill_surface_states(&screen->isl_dev, &surf->surface_state, res,
3207 &isl_surf, view, offset_B, tile_x_el, tile_y_el);
3208
3209 #if GFX_VER == 8
3210 alloc_surface_states(&surf->surface_state_read, aux_usages);
3211 surf->surface_state_read.bo_address = res->bo->address;
3212 fill_surface_states(&screen->isl_dev, &surf->surface_state_read, res,
3213 &read_surf, read_view, read_surf_offset_B,
3214 read_surf_tile_x_sa, read_surf_tile_y_sa);
3215 #endif
3216
3217 return psurf;
3218 }
3219
3220 #if GFX_VER < 9
3221 static void
3222 fill_default_image_param(struct isl_image_param *param)
3223 {
3224 memset(param, 0, sizeof(*param));
3225 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3226 * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3227 * detailed explanation of these parameters.
3228 */
3229 param->swizzling[0] = 0xff;
3230 param->swizzling[1] = 0xff;
3231 }
3232
3233 static void
3234 fill_buffer_image_param(struct isl_image_param *param,
3235 enum pipe_format pfmt,
3236 unsigned size)
3237 {
3238 const unsigned cpp = util_format_get_blocksize(pfmt);
3239
3240 fill_default_image_param(param);
3241 param->size[0] = size / cpp;
3242 param->stride[0] = cpp;
3243 }
3244 #else
3245 #define isl_surf_fill_image_param(x, ...)
3246 #define fill_default_image_param(x, ...)
3247 #define fill_buffer_image_param(x, ...)
3248 #endif
3249
3250 /**
3251 * The pipe->set_shader_images() driver hook.
3252 */
3253 static void
3254 iris_set_shader_images(struct pipe_context *ctx,
3255 enum pipe_shader_type p_stage,
3256 unsigned start_slot, unsigned count,
3257 unsigned unbind_num_trailing_slots,
3258 const struct pipe_image_view *p_images)
3259 {
3260 struct iris_context *ice = (struct iris_context *) ctx;
3261 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3262 gl_shader_stage stage = stage_from_pipe(p_stage);
3263 struct iris_shader_state *shs = &ice->state.shaders[stage];
3264 #if GFX_VER == 8
3265 struct iris_genx_state *genx = ice->state.genx;
3266 struct isl_image_param *image_params = genx->shaders[stage].image_param;
3267 #endif
3268
3269 shs->bound_image_views &=
3270 ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3271
3272 for (unsigned i = 0; i < count; i++) {
3273 struct iris_image_view *iv = &shs->image[start_slot + i];
3274
3275 if (p_images && p_images[i].resource) {
3276 const struct pipe_image_view *img = &p_images[i];
3277 struct iris_resource *res = (void *) img->resource;
3278
3279 util_copy_image_view(&iv->base, img);
3280
3281 shs->bound_image_views |= BITFIELD64_BIT(start_slot + i);
3282
3283 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3284 res->bind_stages |= 1 << stage;
3285
3286 enum isl_format isl_fmt = iris_image_view_get_format(ice, img);
3287
3288 unsigned aux_usages = 1 << ISL_AUX_USAGE_NONE;
3289
3290 /* Gfx12+ supports render compression for images */
3291 if (GFX_VER >= 12 && isl_aux_usage_has_ccs_e(res->aux.usage))
3292 aux_usages |= 1 << ISL_AUX_USAGE_CCS_E;
3293
3294 alloc_surface_states(&iv->surface_state, aux_usages);
3295 iv->surface_state.bo_address = res->bo->address;
3296
3297 if (res->base.b.target != PIPE_BUFFER) {
3298 struct isl_view view = {
3299 .format = isl_fmt,
3300 .base_level = img->u.tex.level,
3301 .levels = 1,
3302 .base_array_layer = img->u.tex.first_layer,
3303 .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3304 .swizzle = ISL_SWIZZLE_IDENTITY,
3305 .usage = ISL_SURF_USAGE_STORAGE_BIT,
3306 };
3307
3308 /* ISL_FORMAT_RAW means the image uses the untyped read/write fallback; emit a raw buffer surface covering the whole BO. */
3309 if (isl_fmt == ISL_FORMAT_RAW) {
3310 fill_buffer_surface_state(&screen->isl_dev, res,
3311 iv->surface_state.cpu,
3312 isl_fmt, ISL_SWIZZLE_IDENTITY,
3313 0, res->bo->size,
3314 ISL_SURF_USAGE_STORAGE_BIT);
3315 } else {
3316 fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3317 &res->surf, &view, 0, 0, 0);
3318 }
3319
3320 isl_surf_fill_image_param(&screen->isl_dev,
3321 &image_params[start_slot + i],
3322 &res->surf, &view);
3323 } else if (img->access & PIPE_IMAGE_ACCESS_TEX2D_FROM_BUFFER) {
3324 /* If this is a 2D image created from a buffer, fill out the surface
3325  * state using the image parameters (width, height, row stride)
3326  * provided by the CL application.
3327  */
3328 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3329 struct isl_view view = {
3330 .format = isl_fmt,
3331 .base_level = 0,
3332 .levels = 1,
3333 .base_array_layer = 0,
3334 .array_len = 1,
3335 .swizzle = ISL_SWIZZLE_IDENTITY,
3336 .usage = usage,
3337 };
3338
3339 /* Create temp_surf and fill with values provided by CL application */
3340 struct isl_surf temp_surf;
3341 enum isl_format fmt = iris_image_view_get_format(ice, img);
3342 fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt,
3343 img->u.tex2d_from_buf.width,
3344 img->u.tex2d_from_buf.height,
3345 img->u.tex2d_from_buf.row_stride,
3346 usage,
3347 &temp_surf);
3348
3349 fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3350 &temp_surf, &view, 0, 0, 0);
3351 isl_surf_fill_image_param(&screen->isl_dev,
3352 &image_params[start_slot + i],
3353 &temp_surf, &view);
3354 } else {
3355 util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3356 img->u.buf.offset + img->u.buf.size);
3357
3358 fill_buffer_surface_state(&screen->isl_dev, res,
3359 iv->surface_state.cpu,
3360 isl_fmt, ISL_SWIZZLE_IDENTITY,
3361 img->u.buf.offset, img->u.buf.size,
3362 ISL_SURF_USAGE_STORAGE_BIT);
3363 fill_buffer_image_param(&image_params[start_slot + i],
3364 img->format, img->u.buf.size);
3365 }
3366
3367 upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
3368 } else {
3369 pipe_resource_reference(&iv->base.resource, NULL);
3370 pipe_resource_reference(&iv->surface_state.ref.res, NULL);
3371 fill_default_image_param(&image_params[start_slot + i]);
3372 }
3373 }
3374
3375 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3376 ice->state.dirty |=
3377 stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3378 : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3379
3380 /* Broadwell also needs isl_image_params re-uploaded */
3381 if (GFX_VER < 9) {
3382 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3383 shs->sysvals_need_upload = true;
3384 }
3385
3386 if (unbind_num_trailing_slots) {
3387 iris_set_shader_images(ctx, p_stage, start_slot + count,
3388 unbind_num_trailing_slots, 0, NULL);
3389 }
3390 }
3391
3392 UNUSED static bool
3393 is_sampler_view_3d(const struct iris_sampler_view *view)
3394 {
3395 return view && view->res->base.b.target == PIPE_TEXTURE_3D;
3396 }
3397
3398 /**
3399 * The pipe->set_sampler_views() driver hook.
3400 */
3401 static void
3402 iris_set_sampler_views(struct pipe_context *ctx,
3403 enum pipe_shader_type p_stage,
3404 unsigned start, unsigned count,
3405 unsigned unbind_num_trailing_slots,
3406 bool take_ownership,
3407 struct pipe_sampler_view **views)
3408 {
3409 struct iris_context *ice = (struct iris_context *) ctx;
3410 UNUSED struct iris_screen *screen = (void *) ctx->screen;
3411 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
3412 gl_shader_stage stage = stage_from_pipe(p_stage);
3413 struct iris_shader_state *shs = &ice->state.shaders[stage];
3414 unsigned i;
3415
3416 if (count == 0 && unbind_num_trailing_slots == 0)
3417 return;
3418
3419 BITSET_CLEAR_RANGE(shs->bound_sampler_views, start,
3420 start + count + unbind_num_trailing_slots - 1);
3421
3422 for (i = 0; i < count; i++) {
3423 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3424 struct iris_sampler_view *view = (void *) pview;
3425
3426 #if GFX_VERx10 == 125
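/* Wa_14014414195: the sampler state we program depends on whether the
 * bound view is a 3D texture, so flag SAMPLER_STATES dirty whenever the
 * 3D-ness of this slot changes.
 */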
3427 if (intel_needs_workaround(screen->devinfo, 14014414195)) {
3428 if (is_sampler_view_3d(shs->textures[start + i]) !=
3429 is_sampler_view_3d(view))
3430 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3431 }
3432 #endif
3433
3434 if (take_ownership) {
3435 pipe_sampler_view_reference((struct pipe_sampler_view **)
3436 &shs->textures[start + i], NULL);
3437 shs->textures[start + i] = (struct iris_sampler_view *)pview;
3438 } else {
3439 pipe_sampler_view_reference((struct pipe_sampler_view **)
3440 &shs->textures[start + i], pview);
3441 }
3442 if (view) {
3443 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3444 view->res->bind_stages |= 1 << stage;
3445
3446 BITSET_SET(shs->bound_sampler_views, start + i);
3447
3448 update_surface_state_addrs(ice->state.surface_uploader,
3449 &view->surface_state, view->res->bo);
3450 }
3451 }
3452 for (; i < count + unbind_num_trailing_slots; i++) {
3453 pipe_sampler_view_reference((struct pipe_sampler_view **)
3454 &shs->textures[start + i], NULL);
3455 }
3456
3457 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
3458 ice->state.dirty |=
3459 stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3460 : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3461 }
3462
3463 static void
3464 iris_set_compute_resources(struct pipe_context *ctx,
3465 unsigned start, unsigned count,
3466 struct pipe_surface **resources)
3467 {
3468 assert(count == 0);
3469 }
3470
3471 static void
3472 iris_set_global_binding(struct pipe_context *ctx,
3473 unsigned start_slot, unsigned count,
3474 struct pipe_resource **resources,
3475 uint32_t **handles)
3476 {
3477 struct iris_context *ice = (struct iris_context *) ctx;
3478
3479 assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
3480 for (unsigned i = 0; i < count; i++) {
3481 if (resources && resources[i]) {
3482 pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3483 resources[i]);
3484
3485 struct iris_resource *res = (void *) resources[i];
3486 assert(res->base.b.target == PIPE_BUFFER);
3487 util_range_add(&res->base.b, &res->valid_buffer_range,
3488 0, res->base.b.width0);
3489
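/* handles[i] holds an offset within the buffer; add the BO's GPU
 * address to turn it into an absolute address.
 */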
3490 uint64_t addr = 0;
3491 memcpy(&addr, handles[i], sizeof(addr));
3492 addr += res->bo->address + res->offset;
3493 memcpy(handles[i], &addr, sizeof(addr));
3494 } else {
3495 pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3496 NULL);
3497 }
3498 }
3499
3500 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
3501 }
3502
3503 /**
3504 * The pipe->set_tess_state() driver hook.
3505 */
3506 static void
3507 iris_set_tess_state(struct pipe_context *ctx,
3508 const float default_outer_level[4],
3509 const float default_inner_level[2])
3510 {
3511 struct iris_context *ice = (struct iris_context *) ctx;
3512 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3513
3514 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3515 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3516
3517 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
3518 shs->sysvals_need_upload = true;
3519 }
3520
3521 static void
3522 iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3523 {
3524 struct iris_context *ice = (struct iris_context *) ctx;
3525
3526 ice->state.patch_vertices = patch_vertices;
3527 }
3528
3529 static void
3530 iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3531 {
3532 struct iris_surface *surf = (void *) p_surf;
3533 pipe_resource_reference(&p_surf->texture, NULL);
3534 pipe_resource_reference(&surf->surface_state.ref.res, NULL);
3535 pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
3536 free(surf->surface_state.cpu);
3537 free(surf->surface_state_read.cpu);
3538 free(surf);
3539 }
3540
3541 static void
3542 iris_set_clip_state(struct pipe_context *ctx,
3543 const struct pipe_clip_state *state)
3544 {
3545 struct iris_context *ice = (struct iris_context *) ctx;
3546 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3547 struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3548 struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3549
3550 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3551
3552 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
3553 IRIS_STAGE_DIRTY_CONSTANTS_GS |
3554 IRIS_STAGE_DIRTY_CONSTANTS_TES;
3555 shs->sysvals_need_upload = true;
3556 gshs->sysvals_need_upload = true;
3557 tshs->sysvals_need_upload = true;
3558 }
3559
3560 /**
3561 * The pipe->set_polygon_stipple() driver hook.
3562 */
3563 static void
3564 iris_set_polygon_stipple(struct pipe_context *ctx,
3565 const struct pipe_poly_stipple *state)
3566 {
3567 struct iris_context *ice = (struct iris_context *) ctx;
3568 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3569 ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
3570 }
3571
3572 /**
3573 * The pipe->set_sample_mask() driver hook.
3574 */
3575 static void
3576 iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3577 {
3578 struct iris_context *ice = (struct iris_context *) ctx;
3579
3580 /* We only support 16x MSAA, so we have 16 bits of sample mask.
3581 * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3582 */
3583 ice->state.sample_mask = sample_mask & 0xffff;
3584 ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
3585 }
3586
3587 /**
3588 * The pipe->set_scissor_states() driver hook.
3589 *
3590 * This corresponds to our SCISSOR_RECT state structures. It's an
3591 * exact match, so we just store them, and memcpy them out later.
3592 */
3593 static void
3594 iris_set_scissor_states(struct pipe_context *ctx,
3595 unsigned start_slot,
3596 unsigned num_scissors,
3597 const struct pipe_scissor_state *rects)
3598 {
3599 struct iris_context *ice = (struct iris_context *) ctx;
3600
3601 for (unsigned i = 0; i < num_scissors; i++) {
3602 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3603 /* If the scissor was out of bounds and got clamped to 0 width/height
3604 * at the bounds, the subtraction of 1 from maximums could produce a
3605 * negative number and thus not clip anything. Instead, just provide
3606 * a min > max scissor inside the bounds, which produces the expected
3607 * no rendering.
3608 */
3609 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3610 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3611 };
3612 } else {
3613 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3614 .minx = rects[i].minx, .miny = rects[i].miny,
3615 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3616 };
3617 }
3618 }
3619
3620 ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3621 }
3622
3623 /**
3624 * The pipe->set_stencil_ref() driver hook.
3625 *
3626 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3627 */
3628 static void
3629 iris_set_stencil_ref(struct pipe_context *ctx,
3630 const struct pipe_stencil_ref state)
3631 {
3632 struct iris_context *ice = (struct iris_context *) ctx;
3633 memcpy(&ice->state.stencil_ref, &state, sizeof(state));
3634 if (GFX_VER >= 12)
3635 ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3636 else if (GFX_VER >= 9)
3637 ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3638 else
3639 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3640 }
3641
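/* Return translate[axis] plus or minus scale[axis], i.e. one edge of the
 * viewport along that axis.
 */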
3642 static float
3643 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3644 {
3645 return copysignf(state->scale[axis], sign) + state->translate[axis];
3646 }
3647
3648 /**
3649 * The pipe->set_viewport_states() driver hook.
3650 *
3651 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3652 * the guardband yet, as we need the framebuffer dimensions, but we can
3653 * at least fill out the rest.
3654 */
3655 static void
3656 iris_set_viewport_states(struct pipe_context *ctx,
3657 unsigned start_slot,
3658 unsigned count,
3659 const struct pipe_viewport_state *states)
3660 {
3661 struct iris_context *ice = (struct iris_context *) ctx;
3662 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3663
3664 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3665
3666 /* Fix depth test misrenderings by lowering translated depth range */
3667 if (screen->driconf.lower_depth_range_rate != 1.0f)
3668 ice->state.viewports[start_slot].translate[2] *=
3669 screen->driconf.lower_depth_range_rate;
3670
3671 ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3672
3673 if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3674 !ice->state.cso_rast->depth_clip_far))
3675 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3676 }
3677
3678 /**
3679 * The pipe->set_framebuffer_state() driver hook.
3680 *
3681 * Sets the current draw FBO, including color render targets, depth,
3682 * and stencil buffers.
3683 */
3684 static void
3685 iris_set_framebuffer_state(struct pipe_context *ctx,
3686 const struct pipe_framebuffer_state *state)
3687 {
3688 struct iris_context *ice = (struct iris_context *) ctx;
3689 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3690 const struct intel_device_info *devinfo = screen->devinfo;
3691 struct isl_device *isl_dev = &screen->isl_dev;
3692 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3693 struct iris_resource *zres;
3694 struct iris_resource *stencil_res;
3695
3696 unsigned samples = util_framebuffer_get_num_samples(state);
3697 unsigned layers = util_framebuffer_get_num_layers(state);
3698
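/* Flag any state that depends on the framebuffer's sample count, layer
 * count, number of attachments, or dimensions before we overwrite the
 * cached copy below.
 */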
3699 if (cso->samples != samples) {
3700 ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
3701
3702 /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
3703 if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
3704 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
3705
3706 /* We may need to emit blend state for Wa_14018912822. */
3707 if ((cso->samples > 1) != (samples > 1) &&
3708 intel_needs_workaround(devinfo, 14018912822)) {
3709 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3710 ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
3711 }
3712 }
3713
3714 if (cso->nr_cbufs != state->nr_cbufs) {
3715 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3716 }
3717
3718 if ((cso->layers == 0) != (layers == 0)) {
3719 ice->state.dirty |= IRIS_DIRTY_CLIP;
3720 }
3721
3722 if (cso->width != state->width || cso->height != state->height) {
3723 ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3724 }
3725
3726 if (cso->zsbuf || state->zsbuf) {
3727 ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
3728 }
3729
3730 bool has_integer_rt = false;
3731 for (unsigned i = 0; i < state->nr_cbufs; i++) {
3732 if (state->cbufs[i]) {
3733 enum isl_format ifmt =
3734 isl_format_for_pipe_format(state->cbufs[i]->format);
3735 has_integer_rt |= isl_format_has_int_channel(ifmt);
3736 }
3737 }
3738
3739 /* 3DSTATE_RASTER::AntialiasingEnable */
3740 if (has_integer_rt != ice->state.has_integer_rt ||
3741 cso->samples != samples) {
3742 ice->state.dirty |= IRIS_DIRTY_RASTER;
3743 }
3744
3745 util_copy_framebuffer_state(cso, state);
3746 cso->samples = samples;
3747 cso->layers = layers;
3748
3749 ice->state.has_integer_rt = has_integer_rt;
3750
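/* Rebuild the depth/stencil/HiZ buffer packets for the new attachment
 * (or for no attachment at all).
 */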
3751 struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
3752
3753 struct isl_view view = {
3754 .base_level = 0,
3755 .levels = 1,
3756 .base_array_layer = 0,
3757 .array_len = 1,
3758 .swizzle = ISL_SWIZZLE_IDENTITY,
3759 };
3760
3761 struct isl_depth_stencil_hiz_emit_info info = {
3762 .view = &view,
3763 .mocs = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_DEPTH_BIT),
3764 };
3765
3766 if (cso->zsbuf) {
3767 iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
3768 &stencil_res);
3769
3770 view.base_level = cso->zsbuf->u.tex.level;
3771 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
3772 view.array_len =
3773 cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
3774
3775 if (zres) {
3776 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
3777
3778 info.depth_surf = &zres->surf;
3779 info.depth_address = zres->bo->address + zres->offset;
3780 info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);
3781
3782 view.format = zres->surf.format;
3783
3784 if (iris_resource_level_has_hiz(devinfo, zres, view.base_level)) {
3785 info.hiz_usage = zres->aux.usage;
3786 info.hiz_surf = &zres->aux.surf;
3787 info.hiz_address = zres->aux.bo->address + zres->aux.offset;
3788 }
3789
3790 ice->state.hiz_usage = info.hiz_usage;
3791 }
3792
3793 if (stencil_res) {
3794 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
3795 info.stencil_aux_usage = stencil_res->aux.usage;
3796 info.stencil_surf = &stencil_res->surf;
3797 info.stencil_address = stencil_res->bo->address + stencil_res->offset;
3798 if (!zres) {
3799 view.format = stencil_res->surf.format;
3800 info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
3801 }
3802 }
3803 }
3804
3805 isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
3806
3807 /* Make a null surface for unbound buffers */
3808 void *null_surf_map =
3809 upload_state(ice->state.surface_uploader, &ice->state.null_fb,
3810 4 * GENX(RENDER_SURFACE_STATE_length), 64);
3811 isl_null_fill_state(&screen->isl_dev, null_surf_map,
3812 .size = isl_extent3d(MAX2(cso->width, 1),
3813 MAX2(cso->height, 1),
3814 cso->layers ? cso->layers : 1));
3815 ice->state.null_fb.offset +=
3816 iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));
3817
3818 /* Render target change */
3819 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;
3820
3821 ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
3822
3823 ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3824
3825 ice->state.stage_dirty |=
3826 ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
3827
3828 if (GFX_VER == 8)
3829 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
3830 }
3831
3832 /**
3833 * The pipe->set_constant_buffer() driver hook.
3834 *
3835 * This uploads any constant data in user buffers, and references
3836 * any UBO resources containing constant data.
3837 */
3838 static void
3839 iris_set_constant_buffer(struct pipe_context *ctx,
3840 enum pipe_shader_type p_stage, unsigned index,
3841 bool take_ownership,
3842 const struct pipe_constant_buffer *input)
3843 {
3844 struct iris_context *ice = (struct iris_context *) ctx;
3845 gl_shader_stage stage = stage_from_pipe(p_stage);
3846 struct iris_shader_state *shs = &ice->state.shaders[stage];
3847 struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3848
3849 /* TODO: Only do this if the buffer changes? */
3850 pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3851
3852 if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3853 shs->bound_cbufs |= 1u << index;
3854
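/* User buffer: copy the data into GPU memory via the constant uploader
 * and point the constant buffer at the new allocation.
 */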
3855 if (input->user_buffer) {
3856 void *map = NULL;
3857 pipe_resource_reference(&cbuf->buffer, NULL);
3858 u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3859 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3860
3861 if (!cbuf->buffer) {
3862 /* Allocation was unsuccessful - just unbind */
3863 iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3864 return;
3865 }
3866
3867 assert(map);
3868 memcpy(map, input->user_buffer, input->buffer_size);
3869 } else if (input->buffer) {
3870 if (cbuf->buffer != input->buffer) {
3871 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3872 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3873 shs->dirty_cbufs |= 1u << index;
3874 }
3875
3876 if (take_ownership) {
3877 pipe_resource_reference(&cbuf->buffer, NULL);
3878 cbuf->buffer = input->buffer;
3879 } else {
3880 pipe_resource_reference(&cbuf->buffer, input->buffer);
3881 }
3882
3883 cbuf->buffer_offset = input->buffer_offset;
3884 }
3885
3886 cbuf->buffer_size =
3887 MIN2(input->buffer_size,
3888 iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3889
3890 struct iris_resource *res = (void *) cbuf->buffer;
3891 res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3892 res->bind_stages |= 1 << stage;
3893 } else {
3894 shs->bound_cbufs &= ~(1u << index);
3895 pipe_resource_reference(&cbuf->buffer, NULL);
3896 }
3897
3898 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3899 }
3900
3901 static void
3902 upload_sysvals(struct iris_context *ice,
3903 gl_shader_stage stage,
3904 const struct pipe_grid_info *grid)
3905 {
3906 UNUSED struct iris_genx_state *genx = ice->state.genx;
3907 struct iris_shader_state *shs = &ice->state.shaders[stage];
3908
3909 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
3910 if (!shader || (shader->num_system_values == 0 &&
3911 shader->kernel_input_size == 0))
3912 return;
3913
3914 assert(shader->num_cbufs > 0);
3915
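/* System values (and any kernel inputs) are uploaded as the shader's
 * last constant buffer.
 */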
3916 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3917 struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
3918 unsigned system_values_start =
3919 ALIGN(shader->kernel_input_size, sizeof(uint32_t));
3920 unsigned upload_size = system_values_start +
3921 shader->num_system_values * sizeof(uint32_t);
3922 void *map = NULL;
3923
3924 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3925 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3926 &cbuf->buffer_offset, &cbuf->buffer, &map);
3927
3928 if (shader->kernel_input_size > 0)
3929 memcpy(map, grid->input, shader->kernel_input_size);
3930
3931 uint32_t *sysval_map = map + system_values_start;
3932 for (int i = 0; i < shader->num_system_values; i++) {
3933 uint32_t sysval = shader->system_values[i];
3934 uint32_t value = 0;
3935
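/* The system value enums come from the brw compiler on Gfx9+ and from
 * elk on older generations.
 */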
3936 #if GFX_VER >= 9
3937 #define COMPILER(x) BRW_##x
3938 #else
3939 #define COMPILER(x) ELK_##x
3940 #endif
3941
3942 if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
3943 #if GFX_VER == 8
3944 unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
3945 unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
3946 struct isl_image_param *param =
3947 &genx->shaders[stage].image_param[img];
3948
3949 assert(offset < sizeof(struct isl_image_param));
3950 value = ((uint32_t *) param)[offset];
3951 #endif
3952 } else if (sysval == COMPILER(PARAM_BUILTIN_ZERO)) {
3953 value = 0;
3954 } else if (COMPILER(PARAM_BUILTIN_IS_CLIP_PLANE(sysval))) {
3955 int plane = COMPILER(PARAM_BUILTIN_CLIP_PLANE_IDX(sysval));
3956 int comp = COMPILER(PARAM_BUILTIN_CLIP_PLANE_COMP(sysval));
3957 value = fui(ice->state.clip_planes.ucp[plane][comp]);
3958 } else if (sysval == COMPILER(PARAM_BUILTIN_PATCH_VERTICES_IN)) {
3959 if (stage == MESA_SHADER_TESS_CTRL) {
3960 value = ice->state.vertices_per_patch;
3961 } else {
3962 assert(stage == MESA_SHADER_TESS_EVAL);
3963 const struct shader_info *tcs_info =
3964 iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3965 if (tcs_info)
3966 value = tcs_info->tess.tcs_vertices_out;
3967 else
3968 value = ice->state.vertices_per_patch;
3969 }
3970 } else if (sysval >= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X) &&
3971 sysval <= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_W)) {
3972 unsigned i = sysval - COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X);
3973 value = fui(ice->state.default_outer_level[i]);
3974 } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_X)) {
3975 value = fui(ice->state.default_inner_level[0]);
3976 } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_Y)) {
3977 value = fui(ice->state.default_inner_level[1]);
3978 } else if (sysval >= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X) &&
3979 sysval <= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_Z)) {
3980 unsigned i = sysval - COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X);
3981 value = ice->state.last_block[i];
3982 } else if (sysval == COMPILER(PARAM_BUILTIN_WORK_DIM)) {
3983 value = grid->work_dim;
3984 } else {
3985 assert(!"unhandled system value");
3986 }
3987
3988 *sysval_map++ = value;
3989 }
3990
3991 cbuf->buffer_size = upload_size;
3992 iris_upload_ubo_ssbo_surf_state(ice, cbuf,
3993 &shs->constbuf_surf_state[sysval_cbuf_index],
3994 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
3995
3996 shs->sysvals_need_upload = false;
3997 }
3998
3999 /**
4000 * The pipe->set_shader_buffers() driver hook.
4001 *
4002 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
4003 * SURFACE_STATE here, as the buffer offset may change each time.
4004 */
4005 static void
4006 iris_set_shader_buffers(struct pipe_context *ctx,
4007 enum pipe_shader_type p_stage,
4008 unsigned start_slot, unsigned count,
4009 const struct pipe_shader_buffer *buffers,
4010 unsigned writable_bitmask)
4011 {
4012 struct iris_context *ice = (struct iris_context *) ctx;
4013 gl_shader_stage stage = stage_from_pipe(p_stage);
4014 struct iris_shader_state *shs = &ice->state.shaders[stage];
4015
4016 unsigned modified_bits = u_bit_consecutive(start_slot, count);
4017
4018 shs->bound_ssbos &= ~modified_bits;
4019 shs->writable_ssbos &= ~modified_bits;
4020 shs->writable_ssbos |= writable_bitmask << start_slot;
4021
4022 for (unsigned i = 0; i < count; i++) {
4023 if (buffers && buffers[i].buffer) {
4024 struct iris_resource *res = (void *) buffers[i].buffer;
4025 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
4026 struct iris_state_ref *surf_state =
4027 &shs->ssbo_surf_state[start_slot + i];
4028 pipe_resource_reference(&ssbo->buffer, &res->base.b);
4029 ssbo->buffer_offset = buffers[i].buffer_offset;
4030 ssbo->buffer_size =
4031 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
4032
4033 shs->bound_ssbos |= 1 << (start_slot + i);
4034
4035 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
4036
4037 iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
4038
4039 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
4040 res->bind_stages |= 1 << stage;
4041
4042 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
4043 ssbo->buffer_offset + ssbo->buffer_size);
4044 } else {
4045 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
4046 pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
4047 NULL);
4048 }
4049 }
4050
4051 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
4052 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
4053 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
4054 }
4055
4056 static void
4057 iris_delete_state(struct pipe_context *ctx, void *state)
4058 {
4059 free(state);
4060 }
4061
4062 /**
4063 * The pipe->set_vertex_buffers() driver hook.
4064 *
4065 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
4066 */
4067 static void
4068 iris_set_vertex_buffers(struct pipe_context *ctx,
4069 unsigned count,
4070 const struct pipe_vertex_buffer *buffers)
4071 {
4072 struct iris_context *ice = (struct iris_context *) ctx;
4073 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4074 struct iris_genx_state *genx = ice->state.genx;
4075
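/* Remember how many buffers were previously bound so we can unbind any
 * trailing slots at the end.
 */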
4076 unsigned last_count = util_last_bit64(ice->state.bound_vertex_buffers);
4077 ice->state.bound_vertex_buffers = 0;
4078
4079 for (unsigned i = 0; i < count; i++) {
4080 const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
4081 struct iris_vertex_buffer_state *state =
4082 &genx->vertex_buffers[i];
4083
4084 if (!buffer) {
4085 pipe_resource_reference(&state->resource, NULL);
4086 continue;
4087 }
4088
4089 /* We may see user buffers that are NULL bindings. */
4090 assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
4091
4092 if (buffer->buffer.resource &&
4093 state->resource != buffer->buffer.resource)
4094 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
4095
4096 pipe_resource_reference(&state->resource, NULL);
4097 state->resource = buffer->buffer.resource;
4098
4099 struct iris_resource *res = (void *) state->resource;
4100
4101 state->offset = (int) buffer->buffer_offset;
4102
4103 if (res) {
4104 ice->state.bound_vertex_buffers |= 1ull << i;
4105 res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4106 }
4107
4108 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
4109 vb.VertexBufferIndex = i;
4110 vb.AddressModifyEnable = true;
4111 /* vb.BufferPitch is merged in dynamically from VE state later */
4112 if (res) {
4113 vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
4114 vb.BufferStartingAddress =
4115 ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
4116 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4117 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4118 #if GFX_VER >= 12
4119 vb.L3BypassDisable = true;
4120 #endif
4121 } else {
4122 vb.NullVertexBuffer = true;
4123 vb.MOCS = iris_mocs(NULL, &screen->isl_dev,
4124 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4125 }
4126 }
4127 }
4128
4129 for (unsigned i = count; i < last_count; i++) {
4130 struct iris_vertex_buffer_state *state =
4131 &genx->vertex_buffers[i];
4132
4133 pipe_resource_reference(&state->resource, NULL);
4134 }
4135
4136 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4137 }
4138
4139 /**
4140 * Gallium CSO for vertex elements.
4141 */
4142 struct iris_vertex_element_state {
4143 uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
4144 uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
4145 uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
4146 uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
4147 uint32_t stride[PIPE_MAX_ATTRIBS];
4148 unsigned vb_count;
4149 unsigned count;
4150 };
4151
4152 /**
4153 * The pipe->create_vertex_elements_state() driver hook.
4154 *
4155 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
4156 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
4157  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
4158  * needed. In those cases we need information that's only available at
4159  * draw time, so we also set up edgeflag_ve and edgeflag_vfi as alternate
4160  * versions of the last VERTEX_ELEMENT_STATE and 3DSTATE_VF_INSTANCING
4161  * that can be used at draw time if the vertex shader needs EdgeFlag.
4162 */
4163 static void *
4164 iris_create_vertex_elements(struct pipe_context *ctx,
4165 unsigned count,
4166 const struct pipe_vertex_element *state)
4167 {
4168 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4169 const struct intel_device_info *devinfo = screen->devinfo;
4170 struct iris_vertex_element_state *cso =
4171 calloc(1, sizeof(struct iris_vertex_element_state));
4172
4173 cso->count = count;
4174 cso->vb_count = 0;
4175
4176 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
4177 ve.DWordLength =
4178 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
4179 }
4180
4181 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
4182 uint32_t *vfi_pack_dest = cso->vf_instancing;
4183
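/* With no vertex elements, still emit a single valid dummy element that
 * sources constant (0, 0, 0, 1.0); the MAX2(count, 1) in the DWordLength
 * above always reserves room for one element.
 */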
4184 if (count == 0) {
4185 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4186 ve.Valid = true;
4187 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
4188 ve.Component0Control = VFCOMP_STORE_0;
4189 ve.Component1Control = VFCOMP_STORE_0;
4190 ve.Component2Control = VFCOMP_STORE_0;
4191 ve.Component3Control = VFCOMP_STORE_1_FP;
4192 }
4193
4194 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4195 }
4196 }
4197
4198 for (int i = 0; i < count; i++) {
4199 const struct iris_format_info fmt =
4200 iris_format_for_usage(devinfo, state[i].src_format, 0);
4201 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
4202 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
4203
4204 switch (isl_format_get_num_channels(fmt.fmt)) {
4205 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
4206 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
4207 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
4208 case 3:
4209 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
4210 : VFCOMP_STORE_1_FP;
4211 break;
4212 }
4213 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4214 ve.EdgeFlagEnable = false;
4215 ve.VertexBufferIndex = state[i].vertex_buffer_index;
4216 ve.Valid = true;
4217 ve.SourceElementOffset = state[i].src_offset;
4218 ve.SourceElementFormat = fmt.fmt;
4219 ve.Component0Control = comp[0];
4220 ve.Component1Control = comp[1];
4221 ve.Component2Control = comp[2];
4222 ve.Component3Control = comp[3];
4223 }
4224
4225 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4226 vi.VertexElementIndex = i;
4227 vi.InstancingEnable = state[i].instance_divisor > 0;
4228 vi.InstanceDataStepRate = state[i].instance_divisor;
4229 }
4230
4231 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
4232 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
4233 cso->stride[state[i].vertex_buffer_index] = state[i].src_stride;
4234 cso->vb_count = MAX2(state[i].vertex_buffer_index + 1, cso->vb_count);
4235 }
4236
4237 /* Store an alternate version of the last VE and VFI so it can be used
4238  * at draw time in case the vertex shader uses EdgeFlag.
4239  */
4240 if (count) {
4241 const unsigned edgeflag_index = count - 1;
4242 const struct iris_format_info fmt =
4243 iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
4244 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
4245 ve.EdgeFlagEnable = true;
4246 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
4247 ve.Valid = true;
4248 ve.SourceElementOffset = state[edgeflag_index].src_offset;
4249 ve.SourceElementFormat = fmt.fmt;
4250 ve.Component0Control = VFCOMP_STORE_SRC;
4251 ve.Component1Control = VFCOMP_STORE_0;
4252 ve.Component2Control = VFCOMP_STORE_0;
4253 ve.Component3Control = VFCOMP_STORE_0;
4254 }
4255 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
4256 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
4257 * at draw time, as it should change if SGVs are emitted.
4258 */
4259 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
4260 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
4261 }
4262 }
4263
4264 return cso;
4265 }
4266
4267 /**
4268 * The pipe->bind_vertex_elements_state() driver hook.
4269 */
4270 static void
4271 iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
4272 {
4273 struct iris_context *ice = (struct iris_context *) ctx;
4274 struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
4275 struct iris_vertex_element_state *new_cso = state;
4276
4277 /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
4278 * we need to re-emit it to ensure we're overriding the right one.
4279 */
4280 if (new_cso && cso_changed(count))
4281 ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
4282
4283 ice->state.cso_vertex_elements = state;
4284 ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
4285 if (new_cso) {
4286 /* re-emit vertex buffer state if stride changes */
4287 if (cso_changed(vb_count) ||
4288 cso_changed_memcmp_elts(stride, new_cso->vb_count))
4289 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4290 }
4291 }
4292
4293 /**
4294 * The pipe->create_stream_output_target() driver hook.
4295 *
4296 * "Target" here refers to a destination buffer. We translate this into
4297 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4298 * know which buffer this represents, or whether we ought to zero the
4299 * write-offsets, or append. Those are handled in the set() hook.
4300 */
4301 static struct pipe_stream_output_target *
4302 iris_create_stream_output_target(struct pipe_context *ctx,
4303 struct pipe_resource *p_res,
4304 unsigned buffer_offset,
4305 unsigned buffer_size)
4306 {
4307 struct iris_resource *res = (void *) p_res;
4308 struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
4309 if (!cso)
4310 return NULL;
4311
4312 res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4313
4314 pipe_reference_init(&cso->base.reference, 1);
4315 pipe_resource_reference(&cso->base.buffer, p_res);
4316 cso->base.buffer_offset = buffer_offset;
4317 cso->base.buffer_size = buffer_size;
4318 cso->base.context = ctx;
4319
4320 util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4321 buffer_offset + buffer_size);
4322
4323 return &cso->base;
4324 }
4325
4326 static void
4327 iris_stream_output_target_destroy(struct pipe_context *ctx,
4328 struct pipe_stream_output_target *state)
4329 {
4330 struct iris_stream_output_target *cso = (void *) state;
4331
4332 pipe_resource_reference(&cso->base.buffer, NULL);
4333 pipe_resource_reference(&cso->offset.res, NULL);
4334
4335 free(cso);
4336 }
4337
4338 /**
4339 * The pipe->set_stream_output_targets() driver hook.
4340 *
4341 * At this point, we know which targets are bound to a particular index,
4342 * and also whether we want to append or start over. We can finish the
4343 * 3DSTATE_SO_BUFFER packets we started earlier.
4344 */
4345 static void
4346 iris_set_stream_output_targets(struct pipe_context *ctx,
4347 unsigned num_targets,
4348 struct pipe_stream_output_target **targets,
4349 const unsigned *offsets)
4350 {
4351 struct iris_context *ice = (struct iris_context *) ctx;
4352 struct iris_genx_state *genx = ice->state.genx;
4353 uint32_t *so_buffers = genx->so_buffers;
4354 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4355
4356 const bool active = num_targets > 0;
4357 if (ice->state.streamout_active != active) {
4358 ice->state.streamout_active = active;
4359 ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
4360
4361 /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4362 * it's a non-pipelined command. If we're switching streamout on, we
4363 * may have missed emitting it earlier, so do so now. (We're already
4364 * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4365 */
4366 if (active) {
4367 ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
4368 } else {
4369 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4370 struct iris_stream_output_target *tgt =
4371 (void *) ice->state.so_target[i];
4372
4373 if (tgt)
4374 iris_dirty_for_history(ice, (void *)tgt->base.buffer);
4375 }
4376 }
4377 }
4378
4379 for (int i = 0; i < 4; i++) {
4380 pipe_so_target_reference(&ice->state.so_target[i],
4381 i < num_targets ? targets[i] : NULL);
4382 }
4383
4384 /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4385 if (!active)
4386 return;
4387
4388 for (unsigned i = 0; i < 4; i++,
4389 so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
4390
4391 struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
4392 unsigned offset = offsets[i];
4393
4394 if (!tgt) {
4395 iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4396 #if GFX_VER < 12
4397 sob.SOBufferIndex = i;
4398 #else
4399 sob._3DCommandOpcode = 0;
4400 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4401 #endif
4402 sob.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
4403 }
4404 continue;
4405 }
4406
4407 if (!tgt->offset.res)
4408 upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
4409
4410 struct iris_resource *res = (void *) tgt->base.buffer;
4411
4412 /* Note that offsets[i] will either be 0, causing us to zero
4413 * the value in the buffer, or 0xFFFFFFFF, which happens to mean
4414 * "continue appending at the existing offset."
4415 */
4416 assert(offset == 0 || offset == 0xFFFFFFFF);
4417
4418 /* When we're first called with an offset of 0, we want the next
4419 * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
4420 * Any further times we emit those packets, we want to use 0xFFFFFFFF
4421 * to continue appending from the current offset.
4422 *
4423 * Note that we might be called by Begin (offset = 0), Pause, then
4424 * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
4425 * commands will actually be sent to the GPU). In this case, we
4426 * don't want to append - we still want to do our initial zeroing.
4427 */
4428 if (offset == 0)
4429 tgt->zero_offset = true;
4430
4431 iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4432 #if GFX_VER < 12
4433 sob.SOBufferIndex = i;
4434 #else
4435 sob._3DCommandOpcode = 0;
4436 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4437 #endif
4438 sob.SurfaceBaseAddress =
4439 rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
4440 IRIS_DOMAIN_OTHER_WRITE);
4441 sob.SOBufferEnable = true;
4442 sob.StreamOffsetWriteEnable = true;
4443 sob.StreamOutputBufferOffsetAddressEnable = true;
4444 sob.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4445 ISL_SURF_USAGE_STREAM_OUT_BIT);
4446
4447 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
4448 sob.StreamOutputBufferOffsetAddress =
4449 rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
4450 tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
4451 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
4452 }
4453 }
4454
4455 ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
4456 }
4457
4458 /**
4459 * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4460 * 3DSTATE_STREAMOUT packets.
4461 *
4462 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4463 * hardware to record. We can create it entirely based on the shader, with
4464 * no dynamic state dependencies.
4465 *
4466 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4467 * state-based settings. We capture the shader-related ones here, and merge
4468 * the rest in at draw time.
4469 */
4470 static uint32_t *
4471 iris_create_so_decl_list(const struct pipe_stream_output_info *info,
4472 const struct intel_vue_map *vue_map)
4473 {
4474 struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4475 int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4476 int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4477 int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4478 int max_decls = 0;
4479 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4480
4481 memset(so_decl, 0, sizeof(so_decl));
4482
4483 /* Construct the list of SO_DECLs to be emitted. The formatting of the
4484 * command feels strange -- each dword pair contains a SO_DECL per stream.
4485 */
4486 for (unsigned i = 0; i < info->num_outputs; i++) {
4487 const struct pipe_stream_output *output = &info->output[i];
4488 const int buffer = output->output_buffer;
4489 const int varying = output->register_index;
4490 const unsigned stream_id = output->stream;
4491 assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4492
4493 buffer_mask[stream_id] |= 1 << buffer;
4494
4495 assert(vue_map->varying_to_slot[varying] >= 0);
4496
4497 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4498 * array. Instead, it simply increments DstOffset for the following
4499 * input by the number of components that should be skipped.
4500 *
4501 * Our hardware is unusual in that it requires us to program SO_DECLs
4502 * for fake "hole" components, rather than simply taking the offset
4503 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
4504 * program as many size = 4 holes as we can, then a final hole to
4505 * accommodate the final 1, 2, or 3 remaining.
4506 */
4507 int skip_components = output->dst_offset - next_offset[buffer];
4508
4509 while (skip_components > 0) {
4510 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4511 .HoleFlag = 1,
4512 .OutputBufferSlot = output->output_buffer,
4513 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4514 };
4515 skip_components -= 4;
4516 }
4517
4518 next_offset[buffer] = output->dst_offset + output->num_components;
4519
4520 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4521 .OutputBufferSlot = output->output_buffer,
4522 .RegisterIndex = vue_map->varying_to_slot[varying],
4523 .ComponentMask =
4524 ((1 << output->num_components) - 1) << output->start_component,
4525 };
4526
4527 if (decls[stream_id] > max_decls)
4528 max_decls = decls[stream_id];
4529 }
4530
4531 unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4532 uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4533 uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
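/* Editor's note (illustrative): the single allocation holds both packets
 * back to back:
 *
 *   map[0 .. STREAMOUT_length - 1]      3DSTATE_STREAMOUT template
 *   so_decl_map[0 .. 2]                 3DSTATE_SO_DECL_LIST header
 *   so_decl_map[3 + 2*i], [4 + 2*i]     SO_DECL_ENTRY i (one SO_DECL per
 *                                       stream, packed as dword pairs)
 *
 * which is where the "3 + 2 * max_decls" dword count above comes from.
 */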
4534
4535 iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4536 int urb_entry_read_offset = 0;
4537 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4538 urb_entry_read_offset;
4539
4540 /* We always read the whole vertex. This could be reduced at some
4541 * point by reading less and offsetting the register index in the
4542 * SO_DECLs.
4543 */
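/* Editor's example (assumed values): with a 9-slot VUE map,
 * urb_entry_read_length = (9 + 1) / 2 = 5, i.e. each stream reads five
 * 256-bit slot pairs (10 slots' worth), and the packet fields below store
 * 5 - 1 = 4.
 */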
4544 sol.Stream0VertexReadOffset = urb_entry_read_offset;
4545 sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4546 sol.Stream1VertexReadOffset = urb_entry_read_offset;
4547 sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4548 sol.Stream2VertexReadOffset = urb_entry_read_offset;
4549 sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4550 sol.Stream3VertexReadOffset = urb_entry_read_offset;
4551 sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4552
4553 /* Set buffer pitches; 0 means unbound. */
4554 sol.Buffer0SurfacePitch = 4 * info->stride[0];
4555 sol.Buffer1SurfacePitch = 4 * info->stride[1];
4556 sol.Buffer2SurfacePitch = 4 * info->stride[2];
4557 sol.Buffer3SurfacePitch = 4 * info->stride[3];
4558 }
4559
4560 iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4561 list.DWordLength = 3 + 2 * max_decls - 2;
4562 list.StreamtoBufferSelects0 = buffer_mask[0];
4563 list.StreamtoBufferSelects1 = buffer_mask[1];
4564 list.StreamtoBufferSelects2 = buffer_mask[2];
4565 list.StreamtoBufferSelects3 = buffer_mask[3];
4566 list.NumEntries0 = decls[0];
4567 list.NumEntries1 = decls[1];
4568 list.NumEntries2 = decls[2];
4569 list.NumEntries3 = decls[3];
4570 }
4571
4572 for (int i = 0; i < max_decls; i++) {
4573 iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4574 entry.Stream0Decl = so_decl[0][i];
4575 entry.Stream1Decl = so_decl[1][i];
4576 entry.Stream2Decl = so_decl[2][i];
4577 entry.Stream3Decl = so_decl[3][i];
4578 }
4579 }
4580
4581 return map;
4582 }
4583
4584 static inline int
4585 iris_compute_first_urb_slot_required(uint64_t inputs_read,
4586 const struct intel_vue_map *prev_stage_vue_map)
4587 {
4588 #if GFX_VER >= 9
4589 return brw_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4590 #else
4591 return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4592 #endif
4593 }
4594
4595 static void
4596 iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
4597 const struct intel_vue_map *last_vue_map,
4598 bool two_sided_color,
4599 unsigned *out_offset,
4600 unsigned *out_length)
4601 {
4602 /* The compiler computes the first URB slot without considering COL/BFC
4603 * swizzling (because it doesn't know whether it's enabled), so we need
4604 * to do that here too. This may result in a smaller offset, which
4605 * should be safe.
4606 */
4607 const unsigned first_slot =
4608 iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
4609
4610 /* This becomes the URB read offset (counted in pairs of slots). */
4611 assert(first_slot % 2 == 0);
4612 *out_offset = first_slot / 2;
4613
4614 /* We need to adjust the inputs read to account for front/back color
4615 * swizzling, as it can make the URB length longer.
4616 */
4617 for (int c = 0; c <= 1; c++) {
4618 if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
4619 /* If two sided color is enabled, the fragment shader's gl_Color
4620 * (COL0) input comes from either the gl_FrontColor (COL0) or
4621 * gl_BackColor (BFC0) input varyings. Mark BFC as used, too.
4622 */
4623 if (two_sided_color)
4624 fs_input_slots |= (VARYING_BIT_BFC0 << c);
4625
4626 /* If front color isn't written, we opt to give them back color
4627 * instead of an undefined value. Switch from COL to BFC.
4628 */
4629 if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
4630 fs_input_slots &= ~(VARYING_BIT_COL0 << c);
4631 fs_input_slots |= (VARYING_BIT_BFC0 << c);
4632 }
4633 }
4634 }
4635
4636 /* Compute the minimum URB Read Length necessary for the FS inputs.
4637 *
4638 * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4639 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4640 *
4641 * "This field should be set to the minimum length required to read the
4642 * maximum source attribute. The maximum source attribute is indicated
4643 * by the maximum value of the enabled Attribute # Source Attribute if
4644 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4645 * enable is not set.
4646 * read_length = ceiling((max_source_attr + 1) / 2)
4647 *
4648 * [errata] Corruption/Hang possible if length programmed larger than
4649 * recommended"
4650 *
4651 * Similar text exists for Ivy Bridge.
4652 *
4653 * We find the last URB slot that's actually read by the FS.
4654 */
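/* Editor's example (assumed values): if first_slot is 2 and the last slot
 * actually read by the FS turns out to be 7, then *out_offset = 2 / 2 = 1
 * and *out_length = DIV_ROUND_UP(7 - 2 + 1, 2) = 3, i.e. the setup unit
 * reads three 256-bit slot pairs starting at the second pair.
 */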
4655 unsigned last_read_slot = last_vue_map->num_slots - 1;
4656 while (last_read_slot > first_slot && !(fs_input_slots &
4657 (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4658 --last_read_slot;
4659
4660 /* The URB read length is the difference of the two, counted in pairs. */
4661 *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
4662 }
4663
4664 static void
4665 iris_emit_sbe_swiz(struct iris_batch *batch,
4666 const struct iris_context *ice,
4667 const struct intel_vue_map *vue_map,
4668 unsigned urb_read_offset,
4669 unsigned sprite_coord_enables)
4670 {
4671 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4672 const struct iris_fs_data *fs_data =
4673 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4674 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4675
4676 /* XXX: this should be generated when putting programs in place */
4677
4678 for (uint8_t idx = 0; idx < fs_data->urb_setup_attribs_count; idx++) {
4679 const uint8_t fs_attr = fs_data->urb_setup_attribs[idx];
4680 const int input_index = fs_data->urb_setup[fs_attr];
4681 if (input_index < 0 || input_index >= 16)
4682 continue;
4683
4684 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4685 &attr_overrides[input_index];
4686 int slot = vue_map->varying_to_slot[fs_attr];
4687
4688 /* Viewport and Layer are stored in the VUE header. We need to override
4689 * them to zero if earlier stages didn't write them, as GL requires that
4690 * they read back as zero when not explicitly set.
4691 */
4692 switch (fs_attr) {
4693 case VARYING_SLOT_VIEWPORT:
4694 case VARYING_SLOT_LAYER:
4695 attr->ComponentOverrideX = true;
4696 attr->ComponentOverrideW = true;
4697 attr->ConstantSource = CONST_0000;
4698
4699 if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4700 attr->ComponentOverrideY = true;
4701 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4702 attr->ComponentOverrideZ = true;
4703 continue;
4704
4705 default:
4706 break;
4707 }
4708
4709 if (sprite_coord_enables & (1 << input_index))
4710 continue;
4711
4712 /* If there was only a back color written but not front, use back
4713 * as the color instead of undefined.
4714 */
4715 if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4716 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4717 if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4718 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4719
4720 /* Not written by the previous stage - undefined. */
4721 if (slot == -1) {
4722 attr->ComponentOverrideX = true;
4723 attr->ComponentOverrideY = true;
4724 attr->ComponentOverrideZ = true;
4725 attr->ComponentOverrideW = true;
4726 attr->ConstantSource = CONST_0001_FLOAT;
4727 continue;
4728 }
4729
4730 /* Compute the location of the attribute relative to the read offset,
4731 * which is counted in 256-bit increments (two 128-bit VUE slots).
4732 */
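/* Editor's example (assumed values): with urb_read_offset = 1 the hardware
 * starts reading at VUE slot 2, so a varying living in slot 5 is addressed
 * as source_attr = 5 - 2 * 1 = 3.
 */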
4733 const int source_attr = slot - 2 * urb_read_offset;
4734 assert(source_attr >= 0 && source_attr <= 32);
4735 attr->SourceAttribute = source_attr;
4736
4737 /* If we are doing two-sided color, and the VUE slot following this one
4738 * represents a back-facing color, then we need to instruct the SF unit
4739 * to do back-facing swizzling.
4740 */
4741 if (cso_rast->light_twoside &&
4742 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4743 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4744 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4745 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4746 attr->SwizzleSelect = INPUTATTR_FACING;
4747 }
4748
4749 iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4750 for (int i = 0; i < 16; i++)
4751 sbes.Attribute[i] = attr_overrides[i];
4752 }
4753 }
4754
4755 static bool
4756 iris_is_drawing_points(const struct iris_context *ice)
4757 {
4758 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4759
4760 if (cso_rast->fill_mode_point) {
4761 return true;
4762 }
4763
4764 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4765 const struct iris_gs_data *gs_data =
4766 iris_gs_data(ice->shaders.prog[MESA_SHADER_GEOMETRY]);
4767 return gs_data->output_topology == _3DPRIM_POINTLIST;
4768 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4769 const struct iris_tes_data *tes_data =
4770 iris_tes_data(ice->shaders.prog[MESA_SHADER_TESS_EVAL]);
4771 return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4772 } else {
4773 return ice->state.prim_mode == MESA_PRIM_POINTS;
4774 }
4775 }
4776
4777 static unsigned
4778 iris_calculate_point_sprite_overrides(const struct iris_fs_data *fs_data,
4779 const struct iris_rasterizer_state *cso)
4780 {
4781 unsigned overrides = 0;
4782
4783 if (fs_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4784 overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_PNTC];
4785
4786 for (int i = 0; i < 8; i++) {
4787 if ((cso->sprite_coord_enable & (1 << i)) &&
4788 fs_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4789 overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_TEX0 + i];
4790 }
4791
4792 return overrides;
4793 }
4794
4795 static void
4796 iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4797 {
4798 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4799 const struct iris_fs_data *fs_data =
4800 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4801 const struct intel_vue_map *last_vue_map =
4802 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
4803
4804 unsigned urb_read_offset, urb_read_length;
4805 iris_compute_sbe_urb_read_interval(fs_data->inputs,
4806 last_vue_map,
4807 cso_rast->light_twoside,
4808 &urb_read_offset, &urb_read_length);
4809
4810 unsigned sprite_coord_overrides =
4811 iris_is_drawing_points(ice) ?
4812 iris_calculate_point_sprite_overrides(fs_data, cso_rast) : 0;
4813
4814 iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4815 sbe.AttributeSwizzleEnable = true;
4816 sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
4817 sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4818 sbe.VertexURBEntryReadOffset = urb_read_offset;
4819 sbe.VertexURBEntryReadLength = urb_read_length;
4820 sbe.ForceVertexURBEntryReadOffset = true;
4821 sbe.ForceVertexURBEntryReadLength = true;
4822 sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
4823 sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4824 #if GFX_VER >= 9
4825 for (int i = 0; i < 32; i++) {
4826 sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4827 }
4828 #endif
4829
4830 /* Ask the hardware to supply PrimitiveID if the fragment shader
4831 * reads it but a previous stage didn't write one.
4832 */
4833 if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
4834 last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
4835 sbe.PrimitiveIDOverrideAttributeSelect =
4836 fs_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
4837 sbe.PrimitiveIDOverrideComponentX = true;
4838 sbe.PrimitiveIDOverrideComponentY = true;
4839 sbe.PrimitiveIDOverrideComponentZ = true;
4840 sbe.PrimitiveIDOverrideComponentW = true;
4841 }
4842 }
4843
4844 iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4845 sprite_coord_overrides);
4846 }
4847
4848 /* ------------------------------------------------------------------- */
4849
4850 /**
4851 * Populate VS program key fields based on the current state.
4852 */
4853 static void
4854 iris_populate_vs_key(const struct iris_context *ice,
4855 const struct shader_info *info,
4856 gl_shader_stage last_stage,
4857 struct iris_vs_prog_key *key)
4858 {
4859 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4860
4861 if (info->clip_distance_array_size == 0 &&
4862 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4863 last_stage == MESA_SHADER_VERTEX)
4864 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4865 }
4866
4867 /**
4868 * Populate TCS program key fields based on the current state.
4869 */
4870 static void
4871 iris_populate_tcs_key(const struct iris_context *ice,
4872 struct iris_tcs_prog_key *key)
4873 {
4874 }
4875
4876 /**
4877 * Populate TES program key fields based on the current state.
4878 */
4879 static void
4880 iris_populate_tes_key(const struct iris_context *ice,
4881 const struct shader_info *info,
4882 gl_shader_stage last_stage,
4883 struct iris_tes_prog_key *key)
4884 {
4885 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4886
4887 if (info->clip_distance_array_size == 0 &&
4888 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4889 last_stage == MESA_SHADER_TESS_EVAL)
4890 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4891 }
4892
4893 /**
4894 * Populate GS program key fields based on the current state.
4895 */
4896 static void
4897 iris_populate_gs_key(const struct iris_context *ice,
4898 const struct shader_info *info,
4899 gl_shader_stage last_stage,
4900 struct iris_gs_prog_key *key)
4901 {
4902 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4903
4904 if (info->clip_distance_array_size == 0 &&
4905 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4906 last_stage == MESA_SHADER_GEOMETRY)
4907 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4908 }
4909
4910 /**
4911 * Populate FS program key fields based on the current state.
4912 */
4913 static void
4914 iris_populate_fs_key(const struct iris_context *ice,
4915 const struct shader_info *info,
4916 struct iris_fs_prog_key *key)
4917 {
4918 struct iris_screen *screen = (void *) ice->ctx.screen;
4919 const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4920 const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4921 const struct iris_rasterizer_state *rast = ice->state.cso_rast;
4922 const struct iris_blend_state *blend = ice->state.cso_blend;
4923
4924 key->nr_color_regions = fb->nr_cbufs;
4925
4926 key->clamp_fragment_color = rast->clamp_fragment_color;
4927
4928 key->alpha_to_coverage = blend->alpha_to_coverage;
4929
4930 key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
4931
4932 key->flat_shade = rast->flatshade &&
4933 (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4934
4935 key->persample_interp = rast->force_persample_interp;
4936 key->multisample_fbo = rast->multisample && fb->samples > 1;
4937
4938 key->coherent_fb_fetch = GFX_VER >= 9 && GFX_VER < 20;
4939
4940 key->force_dual_color_blend =
4941 screen->driconf.dual_color_blend_by_location &&
4942 (blend->blend_enables & 1) && blend->dual_color_blending;
4943 }
4944
4945 static void
4946 iris_populate_cs_key(const struct iris_context *ice,
4947 struct iris_cs_prog_key *key)
4948 {
4949 }
4950
4951 static inline uint32_t
4952 encode_sampler_count(const struct iris_compiled_shader *shader)
4953 {
4954 /* We can potentially have way more than 32 samplers and that's ok.
4955 * However, the 3DSTATE_XS packets only have 3 bits to specify how
4956 * many to pre-fetch and all values above 4 are marked reserved.
4957 */
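/* Editor's example (illustrative): a shader using samplers 0..5 has
 * util_last_bit64(mask) = 6, so this returns DIV_ROUND_UP(6, 4) = 2,
 * requesting prefetch of up to 8 samplers (the field counts in groups
 * of four).
 */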
4958 uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
4959 return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
4960 }
4961
4962 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
4963 pkt.KernelStartPointer = KSP(shader); \
4964 pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
4965 pkt.SamplerCount = encode_sampler_count(shader); \
4966 pkt.FloatingPointMode = shader->use_alt_mode; \
4967 \
4968 pkt.DispatchGRFStartRegisterForURBData = \
4969 shader->dispatch_grf_start_reg; \
4970 pkt.prefix##URBEntryReadLength = vue_data->urb_read_length; \
4971 pkt.prefix##URBEntryReadOffset = 0; \
4972 \
4973 pkt.StatisticsEnable = true; \
4974 pkt.Enable = true; \
4975 \
4976 if (shader->total_scratch) { \
4977 INIT_THREAD_SCRATCH_SIZE(pkt) \
4978 }
4979
4980 /* Note that on Gfx12HP we pass a scratch space surface state offset
4981 * shifted by 2 relative to the value specified on the BSpec, since
4982 * that allows the compiler to save a shift instruction while
4983 * constructing the extended descriptor for SS addressing. That
4984 * worked because we limit the scratch surface state pool to 8 MB and
4985 * because we relied on the legacy (ExBSO=0) encoding of the extended
4986 * descriptor in order to save the shift, which is no longer supported
4987 * for the UGM shared function on Xe2 platforms, so we no longer
4988 * attempt to do that trick.
4989 */
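/* Editor's example (assumed offset): a scratch surface state located
 * 0x1240 bytes into the scratch surface state pool would be programmed as
 * 0x1240 >> 4 = 0x124 with the pre-Xe2 shift of 4, but as
 * 0x1240 >> 6 = 0x49 on Xe2, matching the BSpec encoding there.
 */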
4990 #define SCRATCH_SPACE_BUFFER_SHIFT (GFX_VER >= 20 ? 6 : 4)
4991
4992 #if GFX_VERx10 >= 125
4993 #define INIT_THREAD_SCRATCH_SIZE(pkt)
4994 #define MERGE_SCRATCH_ADDR(name) \
4995 { \
4996 uint32_t pkt2[GENX(name##_length)] = {0}; \
4997 _iris_pack_command(batch, GENX(name), pkt2, p) { \
4998 p.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT; \
4999 } \
5000 iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \
5001 }
5002 #else
5003 #define INIT_THREAD_SCRATCH_SIZE(pkt) \
5004 pkt.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
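/* Editor's example for the PerThreadScratchSpace encoding above
 * (illustrative): scratch sizes are powers of two starting at 1 KB, so
 * total_scratch = 2048 gives ffs(2048) - 11 = 12 - 11 = 1, i.e. 2 KB of
 * scratch per thread.
 */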
5005 #define MERGE_SCRATCH_ADDR(name) \
5006 { \
5007 uint32_t pkt2[GENX(name##_length)] = {0}; \
5008 _iris_pack_command(batch, GENX(name), pkt2, p) { \
5009 p.ScratchSpaceBasePointer = \
5010 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE); \
5011 } \
5012 iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \
5013 }
5014 #endif
5015
5016
5017 /**
5018 * Encode most of 3DSTATE_VS based on the compiled shader.
5019 */
5020 static void
5021 iris_store_vs_state(const struct intel_device_info *devinfo,
5022 struct iris_compiled_shader *shader)
5023 {
5024 struct iris_vue_data *vue_data = iris_vue_data(shader);
5025
5026 iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
5027 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
5028 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
5029 #if GFX_VER < 20
5030 vs.SIMD8DispatchEnable = true;
5031 #endif
5032 vs.UserClipDistanceCullTestEnableBitmask =
5033 vue_data->cull_distance_mask;
5034 }
5035 }
5036
5037 /**
5038 * Encode most of 3DSTATE_HS based on the compiled shader.
5039 */
5040 static void
5041 iris_store_tcs_state(const struct intel_device_info *devinfo,
5042 struct iris_compiled_shader *shader)
5043 {
5044 struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
5045 struct iris_vue_data *vue_data = &tcs_data->base;
5046
5047 iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
5048 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
5049
5050 #if GFX_VER >= 12
5051 /* Wa_1604578095:
5052 *
5053 * A hang occurs when the maximum number of threads is less than twice
5054 * the instance count. The maximum number of threads must be more than
5055 * twice the instance count.
5056 */
5057 assert((devinfo->max_tcs_threads / 2) > tcs_data->instances);
5058 hs.DispatchGRFStartRegisterForURBData = shader->dispatch_grf_start_reg & 0x1f;
5059 hs.DispatchGRFStartRegisterForURBData5 = shader->dispatch_grf_start_reg >> 5;
5060 #endif
5061
5062 hs.InstanceCount = tcs_data->instances - 1;
5063 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
5064 hs.IncludeVertexHandles = true;
5065
5066 #if GFX_VER == 12
5067 /* Patch Count threshold specifies the maximum number of patches that
5068 * will be accumulated before a thread dispatch is forced.
5069 */
5070 hs.PatchCountThreshold = tcs_data->patch_count_threshold;
5071 #endif
5072
5073 #if GFX_VER >= 9
5074 #if GFX_VER < 20
5075 hs.DispatchMode = vue_data->dispatch_mode;
5076 #endif
5077 hs.IncludePrimitiveID = tcs_data->include_primitive_id;
5078 #endif
5079 }
5080 }
5081
5082 /**
5083 * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
5084 */
5085 static void
5086 iris_store_tes_state(const struct intel_device_info *devinfo,
5087 struct iris_compiled_shader *shader)
5088 {
5089 struct iris_tes_data *tes_data = iris_tes_data(shader);
5090 struct iris_vue_data *vue_data = &tes_data->base;
5091
5092 uint32_t *ds_state = (void *) shader->derived_data;
5093 uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
5094
5095 iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
5096 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
5097
5098 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
5099 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
5100 ds.ComputeWCoordinateEnable =
5101 tes_data->domain == INTEL_TESS_DOMAIN_TRI;
5102
5103 #if GFX_VER >= 12
5104 ds.PrimitiveIDNotRequired = !tes_data->include_primitive_id;
5105 #endif
5106 ds.UserClipDistanceCullTestEnableBitmask =
5107 vue_data->cull_distance_mask;
5108 }
5109
5110 iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
5111 te.Partitioning = tes_data->partitioning;
5112 #if GFX_VER >= 20
5113 te.NumberOfRegionsPerPatch = 2;
5114 #endif
5115 te.OutputTopology = tes_data->output_topology;
5116 te.TEDomain = tes_data->domain;
5117 te.TEEnable = true;
5118 te.MaximumTessellationFactorOdd = 63.0;
5119 te.MaximumTessellationFactorNotOdd = 64.0;
5120 #if GFX_VERx10 >= 125
5121 STATIC_ASSERT(TEDMODE_OFF == 0);
5122 if (intel_needs_workaround(devinfo, 14015055625)) {
5123 te.TessellationDistributionMode = TEDMODE_OFF;
5124 } else if (intel_needs_workaround(devinfo, 22012699309)) {
5125 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
5126 } else {
5127 te.TessellationDistributionMode = TEDMODE_RR_FREE;
5128 }
5129
5130 #if GFX_VER >= 20
5131 te.TessellationDistributionLevel = TEDLEVEL_REGION;
5132 #else
5133 te.TessellationDistributionLevel = TEDLEVEL_PATCH;
5134 #endif
5135 /* 64_TRIANGLES */
5136 te.SmallPatchThreshold = 3;
5137 /* 1K_TRIANGLES */
5138 te.TargetBlockSize = 8;
5139 /* 1K_TRIANGLES */
5140 te.LocalBOPAccumulatorThreshold = 1;
5141 #endif
5142 }
5143 }
5144
5145 /**
5146 * Encode most of 3DSTATE_GS based on the compiled shader.
5147 */
5148 static void
5149 iris_store_gs_state(const struct intel_device_info *devinfo,
5150 struct iris_compiled_shader *shader)
5151 {
5152 struct iris_gs_data *gs_data = iris_gs_data(shader);
5153 struct iris_vue_data *vue_data = &gs_data->base;
5154
5155 iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
5156 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
5157
5158 gs.OutputVertexSize = gs_data->output_vertex_size_hwords * 2 - 1;
5159 gs.OutputTopology = gs_data->output_topology;
5160 gs.ControlDataHeaderSize = gs_data->control_data_header_size_hwords;
5161 gs.InstanceControl = gs_data->invocations - 1;
5162 #if GFX_VER < 20
5163 gs.DispatchMode = DISPATCH_MODE_SIMD8;
5164 #endif
5165 gs.IncludePrimitiveID = gs_data->include_primitive_id;
5166 gs.ControlDataFormat = gs_data->control_data_format;
5167 gs.ReorderMode = TRAILING;
5168 gs.ExpectedVertexCount = gs_data->vertices_in;
5169 gs.MaximumNumberofThreads =
5170 GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
5171 : (devinfo->max_gs_threads - 1);
5172
5173 if (gs_data->static_vertex_count != -1) {
5174 gs.StaticOutput = true;
5175 gs.StaticOutputVertexCount = gs_data->static_vertex_count;
5176 }
5177 gs.IncludeVertexHandles = vue_data->include_vue_handles;
5178
5179 gs.UserClipDistanceCullTestEnableBitmask = vue_data->cull_distance_mask;
5180
5181 const int urb_entry_write_offset = 1;
5182 const uint32_t urb_entry_output_length =
5183 DIV_ROUND_UP(vue_data->vue_map.num_slots, 2) - urb_entry_write_offset;
5184
5185 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
5186 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
5187 }
5188 }
5189
5190 /**
5191 * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
5192 */
5193 static void
5194 iris_store_fs_state(const struct intel_device_info *devinfo,
5195 struct iris_compiled_shader *shader)
5196 {
5197 struct iris_fs_data *fs_data = iris_fs_data(shader);
5198
5199 uint32_t *ps_state = (void *) shader->derived_data;
5200 uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
5201
5202 iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
5203 ps.VectorMaskEnable = fs_data->uses_vmask;
5204 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
5205 ps.SamplerCount = encode_sampler_count(shader);
5206 ps.FloatingPointMode = shader->use_alt_mode;
5207 ps.MaximumNumberofThreadsPerPSD =
5208 devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
5209
5210 #if GFX_VER < 20
5211 ps.PushConstantEnable = devinfo->needs_null_push_constant_tbimr_workaround ||
5212 shader->ubo_ranges[0].length > 0;
5213 #endif
5214
5215 /* From the documentation for this packet:
5216 * "If the PS kernel does not need the Position XY Offsets to
5217 * compute a Position Value, then this field should be programmed
5218 * to POSOFFSET_NONE."
5219 *
5220 * "SW Recommendation: If the PS kernel needs the Position Offsets
5221 * to compute a Position XY value, this field should match Position
5222 * ZW Interpolation Mode to ensure a consistent position.xyzw
5223 * computation."
5224 *
5225 * We only require XY sample offsets, so this recommendation does not
5226 * apply at the moment. We might need it in the future.
5227 */
5228 ps.PositionXYOffsetSelect =
5229 fs_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
5230
5231 if (shader->total_scratch) {
5232 INIT_THREAD_SCRATCH_SIZE(ps);
5233 }
5234 }
5235
5236 iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
5237 psx.PixelShaderValid = true;
5238 psx.PixelShaderComputedDepthMode = fs_data->computed_depth_mode;
5239 psx.PixelShaderKillsPixel = fs_data->uses_kill;
5240 #if GFX_VER < 20
5241 psx.AttributeEnable = fs_data->num_varying_inputs != 0;
5242 #endif
5243 psx.PixelShaderUsesSourceDepth = fs_data->uses_src_depth;
5244 psx.PixelShaderUsesSourceW = fs_data->uses_src_w;
5245 psx.PixelShaderIsPerSample = fs_data->is_per_sample;
5246 psx.oMaskPresenttoRenderTarget = fs_data->uses_omask;
5247
5248 #if GFX_VER >= 9
5249 #if GFX_VER >= 20
5250 assert(!fs_data->pulls_bary);
5251 #else
5252 psx.PixelShaderPullsBary = fs_data->pulls_bary;
5253 #endif
5254 psx.PixelShaderComputesStencil = fs_data->computed_stencil;
5255 #endif
5256
5257 #if GFX_VER >= 11
5258 psx.PixelShaderRequiresSubpixelSampleOffsets =
5259 fs_data->uses_sample_offsets;
5260 psx.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
5261 fs_data->uses_npc_bary_coefficients;
5262 psx.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
5263 fs_data->uses_pc_bary_coefficients;
5264 psx.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
5265 fs_data->uses_depth_w_coefficients;
5266 #endif
5267 }
5268 }
5269
5270 /**
5271 * Encode most of INTERFACE_DESCRIPTOR_DATA based on the compiled shader.
5272 */
5275 static void
5276 iris_store_cs_state(const struct intel_device_info *devinfo,
5277 struct iris_compiled_shader *shader)
5278 {
5279 struct iris_cs_data *cs_data = iris_cs_data(shader);
5280 void *map = shader->derived_data;
5281
5282 iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
5283 #if GFX_VERx10 < 125
5284 desc.ConstantURBEntryReadLength = cs_data->push.per_thread.regs;
5285 desc.CrossThreadConstantDataReadLength =
5286 cs_data->push.cross_thread.regs;
5287 #else
5288 assert(cs_data->push.per_thread.regs == 0);
5289 assert(cs_data->push.cross_thread.regs == 0);
5290 #endif
5291 #if GFX_VERx10 <= 125
5292 desc.BarrierEnable = cs_data->uses_barrier;
5293 #endif
5294 /* Typically set to 0 to avoid prefetching on every thread dispatch. */
5295 desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
5296 0 : MIN2(shader->bt.size_bytes / 4, 31);
5297 desc.SamplerCount = encode_sampler_count(shader);
5298 /* TODO: Check if we are missing workarounds and enable mid-thread
5299 * preemption.
5300 *
5301 * We still have issues with mid-thread preemption (it was already
5302 * disabled by the kernel on gfx11, due to missing workarounds). It's
5303 * possible that we are just missing some workarounds, and could enable
5304 * it later, but for now let's disable it to fix a GPU hang in compute
5305 * in Car Chase (and possibly more).
5306 */
5307 #if GFX_VER >= 20
5308 desc.ThreadPreemption = false;
5309 #elif GFX_VER >= 12
5310 desc.ThreadPreemptionDisable = true;
5311 #endif
5312 }
5313 }
5314
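/**
 * Compute the size of the derived data (shader command packets).
 *
 * This must match the data written by the iris_store_xs_state() functions.
 */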
5315 static unsigned
5316 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
5317 {
5318 assert(cache_id <= IRIS_CACHE_BLORP);
5319
5320 static const unsigned dwords[] = {
5321 [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
5322 [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
5323 [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
5324 [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
5325 [IRIS_CACHE_FS] =
5326 GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
5327 [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
5328 [IRIS_CACHE_BLORP] = 0,
5329 };
5330
5331 return sizeof(uint32_t) * dwords[cache_id];
5332 }
5333
5334 /**
5335 * Create any state packets corresponding to the given shader stage
5336 * (e.g. 3DSTATE_VS) and save them as "derived data" in the shader variant.
5337 * This means that we can look up a program in the in-memory cache and
5338 * get most of the state packet without having to reconstruct it.
5339 */
5340 static void
5341 iris_store_derived_program_state(const struct intel_device_info *devinfo,
5342 enum iris_program_cache_id cache_id,
5343 struct iris_compiled_shader *shader)
5344 {
5345 switch (cache_id) {
5346 case IRIS_CACHE_VS:
5347 iris_store_vs_state(devinfo, shader);
5348 break;
5349 case IRIS_CACHE_TCS:
5350 iris_store_tcs_state(devinfo, shader);
5351 break;
5352 case IRIS_CACHE_TES:
5353 iris_store_tes_state(devinfo, shader);
5354 break;
5355 case IRIS_CACHE_GS:
5356 iris_store_gs_state(devinfo, shader);
5357 break;
5358 case IRIS_CACHE_FS:
5359 iris_store_fs_state(devinfo, shader);
5360 break;
5361 case IRIS_CACHE_CS:
5362 iris_store_cs_state(devinfo, shader);
5363 break;
5364 case IRIS_CACHE_BLORP:
5365 break;
5366 }
5367 }
5368
5369 /* ------------------------------------------------------------------- */
5370
5371 static const uint32_t push_constant_opcodes[] = {
5372 [MESA_SHADER_VERTEX] = 21,
5373 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
5374 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
5375 [MESA_SHADER_GEOMETRY] = 22,
5376 [MESA_SHADER_FRAGMENT] = 23,
5377 [MESA_SHADER_COMPUTE] = 0,
5378 };
5379
5380 static uint32_t
5381 use_null_surface(struct iris_batch *batch, struct iris_context *ice)
5382 {
5383 struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
5384
5385 iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5386
5387 return ice->state.unbound_tex.offset;
5388 }
5389
5390 static uint32_t
5391 use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
5392 {
5393 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
5394 if (!ice->state.null_fb.res)
5395 return use_null_surface(batch, ice);
5396
5397 struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
5398
5399 iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5400
5401 return ice->state.null_fb.offset;
5402 }
5403
5404 static uint32_t
5405 surf_state_offset_for_aux(unsigned aux_modes,
5406 enum isl_aux_usage aux_usage)
5407 {
5408 assert(aux_modes & (1 << aux_usage));
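/* Editor's example (illustrative): with aux_modes containing
 * ISL_AUX_USAGE_NONE and ISL_AUX_USAGE_CCS_E, asking for CCS_E finds one
 * enabled mode below it, so the offset is 1 * SURFACE_STATE_ALIGNMENT.
 */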
5409 return SURFACE_STATE_ALIGNMENT *
5410 util_bitcount(aux_modes & ((1 << aux_usage) - 1));
5411 }
5412
5413 #if GFX_VER == 9
5414 static void
5415 surf_state_update_clear_value(struct iris_batch *batch,
5416 struct iris_resource *res,
5417 struct iris_surface_state *surf_state,
5418 enum isl_aux_usage aux_usage)
5419 {
5420 struct isl_device *isl_dev = &batch->screen->isl_dev;
5421 struct iris_bo *state_bo = iris_resource_bo(surf_state->ref.res);
5422 uint64_t real_offset = surf_state->ref.offset + IRIS_MEMZONE_BINDER_START;
5423 uint32_t offset_into_bo = real_offset - state_bo->address;
5424 uint32_t clear_offset = offset_into_bo +
5425 isl_dev->ss.clear_value_offset +
5426 surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5427 uint32_t *color = res->aux.clear_color.u32;
5428
5429 assert(isl_dev->ss.clear_value_size == 16);
5430
5431 if (aux_usage == ISL_AUX_USAGE_HIZ) {
5432 iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
5433 PIPE_CONTROL_WRITE_IMMEDIATE,
5434 state_bo, clear_offset, color[0]);
5435 } else {
5436 iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
5437 PIPE_CONTROL_WRITE_IMMEDIATE,
5438 state_bo, clear_offset,
5439 (uint64_t) color[0] |
5440 (uint64_t) color[1] << 32);
5441 iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
5442 PIPE_CONTROL_WRITE_IMMEDIATE,
5443 state_bo, clear_offset + 8,
5444 (uint64_t) color[2] |
5445 (uint64_t) color[3] << 32);
5446 }
5447
5448 iris_emit_pipe_control_flush(batch,
5449 "update fast clear: state cache invalidate",
5450 PIPE_CONTROL_FLUSH_ENABLE |
5451 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
5452 }
5453 #endif
5454
5455 static void
5456 update_clear_value(struct iris_context *ice,
5457 struct iris_batch *batch,
5458 struct iris_resource *res,
5459 struct iris_surface_state *surf_state,
5460 struct isl_view *view)
5461 {
5462 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5463 UNUSED unsigned aux_modes = surf_state->aux_usages;
5464
5465 /* We only need to update the clear color in the surface state for gfx8 and
5466 * gfx9. Newer gens can read it directly from the clear color state buffer.
5467 */
5468 #if GFX_VER == 9
5469 /* Skip updating the ISL_AUX_USAGE_NONE surface state */
5470 aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
5471
5472 while (aux_modes) {
5473 enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
5474
5475 surf_state_update_clear_value(batch, res, surf_state, aux_usage);
5476 }
5477 #elif GFX_VER == 8
5478 /* TODO: Could update rather than re-filling */
5479 alloc_surface_states(surf_state, surf_state->aux_usages);
5480
5481 fill_surface_states(isl_dev, surf_state, res, &res->surf, view, 0, 0, 0);
5482
5483 upload_surface_states(ice->state.surface_uploader, surf_state);
5484 #endif
5485 }
5486
5487 static uint32_t
5488 use_surface_state(struct iris_batch *batch,
5489 struct iris_surface_state *surf_state,
5490 enum isl_aux_usage aux_usage)
5491 {
5492 iris_use_pinned_bo(batch, iris_resource_bo(surf_state->ref.res), false,
5493 IRIS_DOMAIN_NONE);
5494
5495 return surf_state->ref.offset +
5496 surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5497 }
5498
5499 /**
5500 * Add a surface to the validation list, as well as the buffer containing
5501 * the corresponding SURFACE_STATE.
5502 *
5503 * Returns the binding table entry (offset to SURFACE_STATE).
5504 */
5505 static uint32_t
5506 use_surface(struct iris_context *ice,
5507 struct iris_batch *batch,
5508 struct pipe_surface *p_surf,
5509 bool writeable,
5510 enum isl_aux_usage aux_usage,
5511 bool is_read_surface,
5512 enum iris_domain access)
5513 {
5514 struct iris_surface *surf = (void *) p_surf;
5515 struct iris_resource *res = (void *) p_surf->texture;
5516
5517 if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
5518 upload_surface_states(ice->state.surface_uploader,
5519 &surf->surface_state_read);
5520 }
5521
5522 if (!surf->surface_state.ref.res) {
5523 upload_surface_states(ice->state.surface_uploader,
5524 &surf->surface_state);
5525 }
5526
5527 if (memcmp(&res->aux.clear_color, &surf->clear_color,
5528 sizeof(surf->clear_color)) != 0) {
5529 update_clear_value(ice, batch, res, &surf->surface_state, &surf->view);
5530 if (GFX_VER == 8) {
5531 update_clear_value(ice, batch, res, &surf->surface_state_read,
5532 &surf->read_view);
5533 }
5534 surf->clear_color = res->aux.clear_color;
5535 }
5536
5537 if (res->aux.clear_color_bo)
5538 iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
5539
5540 if (res->aux.bo)
5541 iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
5542
5543 iris_use_pinned_bo(batch, res->bo, writeable, access);
5544
5545 if (GFX_VER == 8 && is_read_surface) {
5546 return use_surface_state(batch, &surf->surface_state_read, aux_usage);
5547 } else {
5548 return use_surface_state(batch, &surf->surface_state, aux_usage);
5549 }
5550 }
5551
5552 static uint32_t
5553 use_sampler_view(struct iris_context *ice,
5554 struct iris_batch *batch,
5555 struct iris_sampler_view *isv)
5556 {
5557 enum isl_aux_usage aux_usage =
5558 iris_resource_texture_aux_usage(ice, isv->res, isv->view.format,
5559 isv->view.base_level, isv->view.levels);
5560
5561 if (!isv->surface_state.ref.res)
5562 upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
5563
5564 if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
5565 sizeof(isv->clear_color)) != 0) {
5566 update_clear_value(ice, batch, isv->res, &isv->surface_state,
5567 &isv->view);
5568 isv->clear_color = isv->res->aux.clear_color;
5569 }
5570
5571 if (isv->res->aux.clear_color_bo) {
5572 iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
5573 false, IRIS_DOMAIN_SAMPLER_READ);
5574 }
5575
5576 if (isv->res->aux.bo) {
5577 iris_use_pinned_bo(batch, isv->res->aux.bo,
5578 false, IRIS_DOMAIN_SAMPLER_READ);
5579 }
5580
5581 iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_SAMPLER_READ);
5582
5583 return use_surface_state(batch, &isv->surface_state, aux_usage);
5584 }
5585
5586 static uint32_t
5587 use_ubo_ssbo(struct iris_batch *batch,
5588 struct iris_context *ice,
5589 struct pipe_shader_buffer *buf,
5590 struct iris_state_ref *surf_state,
5591 bool writable, enum iris_domain access)
5592 {
5593 if (!buf->buffer || !surf_state->res)
5594 return use_null_surface(batch, ice);
5595
5596 iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
5597 iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
5598 IRIS_DOMAIN_NONE);
5599
5600 return surf_state->offset;
5601 }
5602
5603 static uint32_t
5604 use_image(struct iris_batch *batch, struct iris_context *ice,
5605 struct iris_shader_state *shs, const struct shader_info *info,
5606 int i)
5607 {
5608 struct iris_image_view *iv = &shs->image[i];
5609 struct iris_resource *res = (void *) iv->base.resource;
5610
5611 if (!res)
5612 return use_null_surface(batch, ice);
5613
5614 bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5615
5616 iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
5617
5618 if (res->aux.bo)
5619 iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
5620
5621 if (res->aux.clear_color_bo) {
5622 iris_use_pinned_bo(batch, res->aux.clear_color_bo, false,
5623 IRIS_DOMAIN_NONE);
5624 }
5625
5626 enum isl_aux_usage aux_usage = shs->image_aux_usage[i];
5627
5628 return use_surface_state(batch, &iv->surface_state, aux_usage);
5629 }
5630
5631 #define push_bt_entry(addr) \
5632 assert(addr >= surf_base_offset); \
5633 assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
5634 if (!pin_only) bt_map[s++] = (addr) - surf_base_offset;
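/* Editor's note (illustrative): push_bt_entry stores addresses relative to
 * surf_base_offset, e.g. an addr 0x1000 bytes above surf_base_offset lands
 * in the binding table as 0x1000. On Gfx11+ surf_base_offset is zero, so
 * offsets are used unmodified.
 */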
5635
5636 #define bt_assert(section) \
5637 if (!pin_only && shader->bt.used_mask[section] != 0) \
5638 assert(shader->bt.offsets[section] == s);
5639
5640 /**
5641 * Populate the binding table for a given shader stage.
5642 *
5643 * This fills out the table of pointers to surfaces required by the shader,
5644 * and also adds those buffers to the validation list so the kernel can
5645 * make them resident before running our batch.
5646 */
5647 static void
5648 iris_populate_binding_table(struct iris_context *ice,
5649 struct iris_batch *batch,
5650 gl_shader_stage stage,
5651 bool pin_only)
5652 {
5653 const struct iris_binder *binder = &ice->state.binder;
5654 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5655 if (!shader)
5656 return;
5657
5658 struct iris_binding_table *bt = &shader->bt;
5659 struct iris_shader_state *shs = &ice->state.shaders[stage];
5660 uint32_t surf_base_offset = GFX_VER < 11 ? binder->bo->address : 0;
5661
5662 uint32_t *bt_map = binder->map + binder->bt_offset[stage];
5663 int s = 0;
5664
5665 const struct shader_info *info = iris_get_shader_info(ice, stage);
5666 if (!info) {
5667 /* TCS passthrough doesn't need a binding table. */
5668 assert(stage == MESA_SHADER_TESS_CTRL);
5669 return;
5670 }
5671
5672 if (stage == MESA_SHADER_COMPUTE &&
5673 shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
5674 /* surface for gl_NumWorkGroups */
5675 struct iris_state_ref *grid_data = &ice->state.grid_size;
5676 struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
5677 iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
5678 IRIS_DOMAIN_PULL_CONSTANT_READ);
5679 iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
5680 IRIS_DOMAIN_NONE);
5681 push_bt_entry(grid_state->offset);
5682 }
5683
5684 if (stage == MESA_SHADER_FRAGMENT) {
5685 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5686 /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5687 if (cso_fb->nr_cbufs) {
5688 for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5689 uint32_t addr;
5690 if (cso_fb->cbufs[i]) {
5691 addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
5692 ice->state.draw_aux_usage[i], false,
5693 IRIS_DOMAIN_RENDER_WRITE);
5694 } else {
5695 addr = use_null_fb_surface(batch, ice);
5696 }
5697 push_bt_entry(addr);
5698 }
5699 } else if (GFX_VER < 11) {
5700 uint32_t addr = use_null_fb_surface(batch, ice);
5701 push_bt_entry(addr);
5702 }
5703 }
5704
5705 #define foreach_surface_used(index, group) \
5706 bt_assert(group); \
5707 for (int index = 0; index < bt->sizes[group]; index++) \
5708 if (iris_group_index_to_bti(bt, group, index) != \
5709 IRIS_SURFACE_NOT_USED)
5710
5711 foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5712 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5713 uint32_t addr;
5714 if (cso_fb->cbufs[i]) {
5715 addr = use_surface(ice, batch, cso_fb->cbufs[i],
5716 false, ice->state.draw_aux_usage[i], true,
5717 IRIS_DOMAIN_SAMPLER_READ);
5718 push_bt_entry(addr);
5719 }
5720 }
5721
5722 foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_LOW64) {
5723 struct iris_sampler_view *view = shs->textures[i];
5724 uint32_t addr = view ? use_sampler_view(ice, batch, view)
5725 : use_null_surface(batch, ice);
5726 push_bt_entry(addr);
5727 }
5728
5729 foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_HIGH64) {
5730 struct iris_sampler_view *view = shs->textures[64 + i];
5731 uint32_t addr = view ? use_sampler_view(ice, batch, view)
5732 : use_null_surface(batch, ice);
5733 push_bt_entry(addr);
5734 }
5735
5736 foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5737 uint32_t addr = use_image(batch, ice, shs, info, i);
5738 push_bt_entry(addr);
5739 }
5740
5741 foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5742 uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5743 &shs->constbuf_surf_state[i], false,
5744 IRIS_DOMAIN_PULL_CONSTANT_READ);
5745 push_bt_entry(addr);
5746 }
5747
5748 foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5749 uint32_t addr =
5750 use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5751 shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5752 push_bt_entry(addr);
5753 }
5754
5755 #if 0
5756 /* XXX: YUV surfaces not implemented yet */
5757 bt_assert(plane_start[1], ...);
5758 bt_assert(plane_start[2], ...);
5759 #endif
5760 }
5761
5762 static void
5763 iris_use_optional_res(struct iris_batch *batch,
5764 struct pipe_resource *res,
5765 bool writeable,
5766 enum iris_domain access)
5767 {
5768 if (res) {
5769 struct iris_bo *bo = iris_resource_bo(res);
5770 iris_use_pinned_bo(batch, bo, writeable, access);
5771 }
5772 }
5773
5774 static void
5775 pin_depth_and_stencil_buffers(struct iris_batch *batch,
5776 struct pipe_surface *zsbuf,
5777 struct iris_depth_stencil_alpha_state *cso_zsa)
5778 {
5779 if (!zsbuf)
5780 return;
5781
5782 struct iris_resource *zres, *sres;
5783 iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5784
5785 if (zres) {
5786 iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5787 IRIS_DOMAIN_DEPTH_WRITE);
5788 if (zres->aux.bo) {
5789 iris_use_pinned_bo(batch, zres->aux.bo,
5790 cso_zsa->depth_writes_enabled,
5791 IRIS_DOMAIN_DEPTH_WRITE);
5792 }
5793 }
5794
5795 if (sres) {
5796 iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5797 IRIS_DOMAIN_DEPTH_WRITE);
5798 }
5799 }
5800
5801 static uint32_t
5802 pin_scratch_space(struct iris_context *ice,
5803 struct iris_batch *batch,
5804 const struct iris_compiled_shader *shader,
5805 gl_shader_stage stage)
5806 {
5807 uint32_t scratch_addr = 0;
5808
5809 if (shader->total_scratch > 0) {
5810 struct iris_bo *scratch_bo =
5811 iris_get_scratch_space(ice, shader->total_scratch, stage);
5812 iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5813
5814 #if GFX_VERx10 >= 125
5815 const struct iris_state_ref *ref =
5816 iris_get_scratch_surf(ice, shader->total_scratch);
5817 iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5818 false, IRIS_DOMAIN_NONE);
5819 scratch_addr = ref->offset +
5820 iris_resource_bo(ref->res)->address -
5821 IRIS_MEMZONE_SCRATCH_START;
5822 assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
5823 #else
5824 scratch_addr = scratch_bo->address;
5825 #endif
5826 }
5827
5828 return scratch_addr;
5829 }
5830
5831 /* ------------------------------------------------------------------- */
5832
5833 /**
5834 * Pin any BOs which were installed by a previous batch, and restored
5835 * via the hardware logical context mechanism.
5836 *
5837 * We don't need to re-emit all state every batch - the hardware context
5838 * mechanism will save and restore it for us. This includes pointers to
5839 * various BOs...which won't exist unless we ask the kernel to pin them
5840 * by adding them to the validation list.
5841 *
5842 * We can skip buffers if we've re-emitted those packets, as we're
5843 * overwriting those stale pointers with new ones, and don't actually
5844 * refer to the old BOs.
5845 */
5846 static void
5847 iris_restore_render_saved_bos(struct iris_context *ice,
5848 struct iris_batch *batch,
5849 const struct pipe_draw_info *draw)
5850 {
5851 struct iris_genx_state *genx = ice->state.genx;
5852
5853 const uint64_t clean = ~ice->state.dirty;
5854 const uint64_t stage_clean = ~ice->state.stage_dirty;
5855
5856 if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5857 iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5858 IRIS_DOMAIN_NONE);
5859 }
5860
5861 if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
5862 iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
5863 IRIS_DOMAIN_NONE);
5864 }
5865
5866 if (clean & IRIS_DIRTY_BLEND_STATE) {
5867 iris_use_optional_res(batch, ice->state.last_res.blend, false,
5868 IRIS_DOMAIN_NONE);
5869 }
5870
5871 if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
5872 iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
5873 IRIS_DOMAIN_NONE);
5874 }
5875
5876 if (clean & IRIS_DIRTY_SCISSOR_RECT) {
5877 iris_use_optional_res(batch, ice->state.last_res.scissor, false,
5878 IRIS_DOMAIN_NONE);
5879 }
5880
5881 if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
5882 for (int i = 0; i < 4; i++) {
5883 struct iris_stream_output_target *tgt =
5884 (void *) ice->state.so_target[i];
5885 if (tgt) {
5886 iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
5887 true, IRIS_DOMAIN_OTHER_WRITE);
5888 iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
5889 true, IRIS_DOMAIN_OTHER_WRITE);
5890 }
5891 }
5892 }
5893
5894 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5895 if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
5896 continue;
5897
5898 struct iris_shader_state *shs = &ice->state.shaders[stage];
5899 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5900
5901 if (!shader)
5902 continue;
5903
5904 for (int i = 0; i < 4; i++) {
5905 const struct iris_ubo_range *range = &shader->ubo_ranges[i];
5906
5907 if (range->length == 0)
5908 continue;
5909
5910 /* Range block is a binding table index, map back to UBO index. */
5911 unsigned block_index = iris_bti_to_group_index(
5912 &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
5913 assert(block_index != IRIS_SURFACE_NOT_USED);
5914
5915 struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
5916 struct iris_resource *res = (void *) cbuf->buffer;
5917
5918 if (res)
5919 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
5920 else
5921 iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
5922 IRIS_DOMAIN_OTHER_READ);
5923 }
5924 }
5925
5926 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5927 if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
5928 /* Re-pin any buffers referred to by the binding table. */
5929 iris_populate_binding_table(ice, batch, stage, true);
5930 }
5931 }
5932
5933 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5934 struct iris_shader_state *shs = &ice->state.shaders[stage];
5935 struct pipe_resource *res = shs->sampler_table.res;
5936 if (res)
5937 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5938 IRIS_DOMAIN_NONE);
5939 }
5940
5941 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5942 if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
5943 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5944
5945 if (shader) {
5946 struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
5947 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
5948
5949 pin_scratch_space(ice, batch, shader, stage);
5950 }
5951 }
5952 }
5953
5954 if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
5955 (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
5956 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5957 pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
5958 }
5959
5960 iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
5961 IRIS_DOMAIN_VF_READ);
5962
5963 if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
5964 uint64_t bound = ice->state.bound_vertex_buffers;
5965 while (bound) {
5966 const int i = u_bit_scan64(&bound);
5967 struct pipe_resource *res = genx->vertex_buffers[i].resource;
5968 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5969 IRIS_DOMAIN_VF_READ);
5970 }
5971 }
5972 }
5973
5974 static void
5975 iris_restore_compute_saved_bos(struct iris_context *ice,
5976 struct iris_batch *batch,
5977 const struct pipe_grid_info *grid)
5978 {
5979 const uint64_t stage_clean = ~ice->state.stage_dirty;
5980
5981 const int stage = MESA_SHADER_COMPUTE;
5982 struct iris_shader_state *shs = &ice->state.shaders[stage];
5983
5984 if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
5985 /* Re-pin any buffers referred to by the binding table. */
5986 iris_populate_binding_table(ice, batch, stage, true);
5987 }
5988
5989 struct pipe_resource *sampler_res = shs->sampler_table.res;
5990 if (sampler_res)
5991 iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
5992 IRIS_DOMAIN_NONE);
5993
5994 if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
5995 (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
5996 (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
5997 (stage_clean & IRIS_STAGE_DIRTY_CS)) {
5998 iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
5999 IRIS_DOMAIN_NONE);
6000 }
6001
6002 if (stage_clean & IRIS_STAGE_DIRTY_CS) {
6003 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6004
6005 if (shader) {
6006 struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6007 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6008
6009 if (GFX_VERx10 < 125) {
6010 struct iris_bo *curbe_bo =
6011 iris_resource_bo(ice->state.last_res.cs_thread_ids);
6012 iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
6013 }
6014
6015 pin_scratch_space(ice, batch, shader, stage);
6016 }
6017 }
6018 }
6019
6020 /**
6021 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
6022 */
6023 static void
6024 iris_update_binder_address(struct iris_batch *batch,
6025 struct iris_binder *binder)
6026 {
6027 if (batch->last_binder_address == binder->bo->address)
6028 return;
6029
6030 struct isl_device *isl_dev = &batch->screen->isl_dev;
6031 uint32_t mocs = isl_mocs(isl_dev, 0, false);
6032
6033 iris_batch_sync_region_start(batch);
6034
6035 #if GFX_VER >= 11
6036 /* Use 3DSTATE_BINDING_TABLE_POOL_ALLOC on Icelake and later */
6037
6038 #if GFX_VERx10 == 120
6039 /* Wa_1607854226:
6040 *
6041 * Workaround the non-pipelined state not applying in MEDIA/GPGPU pipeline
6042 * mode by putting the pipeline temporarily in 3D mode.
6043 */
6044 if (batch->name == IRIS_BATCH_COMPUTE)
6045 emit_pipeline_select(batch, _3D);
6046 #endif
6047
6048 iris_emit_pipe_control_flush(batch, "Stall for binder realloc",
6049 PIPE_CONTROL_CS_STALL);
6050
6051 iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
6052 btpa.BindingTablePoolBaseAddress = ro_bo(binder->bo, 0);
6053 btpa.BindingTablePoolBufferSize = binder->size / 4096;
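      /* Note on the division above: the pool size is expressed in 4 KiB
       * increments, so e.g. a 64 KiB binder BO is programmed as 16.
       */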
6054 #if GFX_VERx10 < 125
6055 btpa.BindingTablePoolEnable = true;
6056 #endif
6057 btpa.MOCS = mocs;
6058 }
6059
6060 #if GFX_VERx10 == 120
6061 /* Wa_1607854226:
6062 *
6063 * Put the pipeline back into compute mode.
6064 */
6065 if (batch->name == IRIS_BATCH_COMPUTE)
6066 emit_pipeline_select(batch, GPGPU);
6067 #endif
6068 #else
6069 /* Use STATE_BASE_ADDRESS on older platforms */
6070 flush_before_state_base_change(batch);
6071
6072 iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
6073 sba.SurfaceStateBaseAddressModifyEnable = true;
6074 sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
6075
6076 /* The hardware appears to pay attention to the MOCS fields even
6077 * if you don't set the "Address Modify Enable" bit for the base.
6078 */
6079 sba.GeneralStateMOCS = mocs;
6080 sba.StatelessDataPortAccessMOCS = mocs;
6081 sba.DynamicStateMOCS = mocs;
6082 sba.IndirectObjectMOCS = mocs;
6083 sba.InstructionMOCS = mocs;
6084 sba.SurfaceStateMOCS = mocs;
6085 #if GFX_VER >= 9
6086 sba.BindlessSurfaceStateMOCS = mocs;
6087 #endif
6088 #if GFX_VERx10 >= 125
6089 sba.L1CacheControl = L1CC_WB;
6090 #endif
6091 }
6092 #endif
6093
6094 flush_after_state_base_change(batch);
6095 iris_batch_sync_region_end(batch);
6096
6097 batch->last_binder_address = binder->bo->address;
6098 }
6099
6100 static inline void
6101 iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
6102 bool window_space_position, float *zmin, float *zmax)
6103 {
6104 if (window_space_position) {
6105 *zmin = 0.f;
6106 *zmax = 1.f;
6107 return;
6108 }
6109 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
6110 }
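/* A rough sketch of what the helper above computes, assuming the usual
 * Gallium viewport conventions (see util_viewport_zmin_zmax()):
 *
 *    near  = vp->translate[2] - (halfz ? 0.0f : vp->scale[2]);
 *    far   = vp->translate[2] + vp->scale[2];
 *    *zmin = MIN2(near, far);
 *    *zmax = MAX2(near, far);
 *
 * With window_space_position the range is simply forced to [0, 1].
 */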
6111
6112 /* Wa_16018063123 */
6113 static inline void
6114 batch_emit_fast_color_dummy_blit(struct iris_batch *batch)
6115 {
6116 #if GFX_VERx10 >= 125
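   /* This packs a minimal 1x4-pixel, linear XY_FAST_COLOR_BLT aimed at the
    * screen's workaround address; the destination contents are irrelevant,
    * the blit only exists to satisfy Wa_16018063123.
    */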
6117 iris_emit_cmd(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6118 blt.DestinationBaseAddress = batch->screen->workaround_address;
6119 blt.DestinationMOCS = iris_mocs(batch->screen->workaround_address.bo,
6120 &batch->screen->isl_dev,
6121 ISL_SURF_USAGE_BLITTER_DST_BIT);
6122 blt.DestinationPitch = 63;
6123 blt.DestinationX2 = 1;
6124 blt.DestinationY2 = 4;
6125 blt.DestinationSurfaceWidth = 1;
6126 blt.DestinationSurfaceHeight = 4;
6127 blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6128 blt.DestinationSurfaceQPitch = 4;
6129 blt.DestinationTiling = XY_TILE_LINEAR;
6130 }
6131 #endif
6132 }
6133
6134 #if GFX_VER >= 12
6135 static void
6136 invalidate_aux_map_state_per_engine(struct iris_batch *batch)
6137 {
6138 uint64_t register_addr = 0;
6139
6140 switch (batch->name) {
6141 case IRIS_BATCH_RENDER: {
6142 /* HSD 1209978178: docs say that before programming the aux table:
6143 *
6144 * "Driver must ensure that the engine is IDLE but ensure it doesn't
6145 * add extra flushes in the case it knows that the engine is already
6146 * IDLE."
6147 *
6148 * An end of pipe sync is needed here, otherwise we see GPU hangs in
6149 * dEQP-GLES31.functional.copy_image.* tests.
6150 *
6151 * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6152 *
6153 * "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
6154 *
6155 * Notice we don't set the L3 Fabric Flush here, because we have
6156 * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6157 * documentation says:
6158 *
6159 * "L3 Fabric Flush will ensure all the pending transactions in the
6160 * L3 Fabric are flushed to global observation point. HW does
6161 * implicit L3 Fabric Flush on all stalling flushes (both explicit
6162 * and implicit) and on PIPECONTROL having Post Sync Operation
6163 * enabled."
6164 *
6165 * Therefore setting L3 Fabric Flush here would be redundant.
6166 *
6167 * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6168 * RCS engine idle sequence:
6169 *
6170 * Gfx125+:
6171 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6172 * Target Cache Flush + Depth Cache + CCS flush
6173 *
6174 */
6175 iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6176 PIPE_CONTROL_CS_STALL |
6177 PIPE_CONTROL_RENDER_TARGET_FLUSH |
6178 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
6179 (GFX_VERx10 == 125 ?
6180 PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6181
6182 register_addr = GENX(GFX_CCS_AUX_INV_num);
6183 break;
6184 }
6185 case IRIS_BATCH_COMPUTE: {
6186 /*
6187 * Notice we don't set the L3 Fabric Flush here, because we have
6188 * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6189 * documentation says:
6190 *
6191 * "L3 Fabric Flush will ensure all the pending transactions in the
6192 * L3 Fabric are flushed to global observation point. HW does
6193 * implicit L3 Fabric Flush on all stalling flushes (both explicit
6194 * and implicit) and on PIPECONTROL having Post Sync Operation
6195 * enabled."
6196 *
6197 * Therefore setting L3 Fabric Flush here would be redundant.
6198 *
6199 * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6200 * Compute engine idle sequence:
6201 *
6202 * Gfx125+:
6203 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
6204 */
6205 iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6206 PIPE_CONTROL_DATA_CACHE_FLUSH |
6207 PIPE_CONTROL_CS_STALL |
6208 (GFX_VERx10 == 125 ?
6209 PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6210
6211 register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
6212 break;
6213 }
6214 case IRIS_BATCH_BLITTER: {
6215 #if GFX_VERx10 >= 125
6216 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6217 if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
6218 batch_emit_fast_color_dummy_blit(batch);
6219
6220 /*
6221 * Notice we don't set the L3 Fabric Flush here, because we have
6222 * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6223 * documentation says:
6224 *
6225 * "L3 Fabric Flush will ensure all the pending transactions in the
6226 * L3 Fabric are flushed to global observation point. HW does
6227 * implicit L3 Fabric Flush on all stalling flushes (both explicit
6228 * and implicit) and on PIPECONTROL having Post Sync Operation
6229 * enabled."
6230 *
6231 * Therefore setting L3 Fabric Flush here would be redundant.
6232 *
6233 * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6234 * Blitter engine idle sequence:
6235 *
6236 * Gfx125+:
6237 * MI_FLUSH_DW (dw0;b16 – flush CCS)
6238 */
6239 iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
6240 fd.FlushCCS = true;
6241 }
6242 register_addr = GENX(BCS_CCS_AUX_INV_num);
6243 #endif
6244 break;
6245 }
6246 default:
6247 unreachable("Invalid batch for aux map invalidation");
6248 break;
6249 }
6250
6251 if (register_addr != 0) {
6252 /* If the aux-map state number increased, then we need to rewrite the
6253 * register. Rewriting the register is used to both set the aux-map
6254 * translation table address, and also to invalidate any previously
6255 * cached translations.
6256 */
6257 iris_load_register_imm32(batch, register_addr, 1);
6258
6259 /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6260 *
6261 * "Poll Aux Invalidation bit once the invalidation is set (Register
6262 * 4208 bit 0)"
6263 */
6264 iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6265 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6266 sem.WaitMode = PollingMode;
6267 sem.RegisterPollMode = true;
6268 sem.SemaphoreDataDword = 0x0;
6269 sem.SemaphoreAddress = ro_bo(NULL, register_addr);
6270 }
6271 }
6272 }
6273
6274 void
6275 genX(invalidate_aux_map_state)(struct iris_batch *batch)
6276 {
6277 struct iris_screen *screen = batch->screen;
6278 void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6279 if (!aux_map_ctx)
6280 return;
6281 uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
6282 if (batch->last_aux_map_state != aux_map_state_num) {
6283 invalidate_aux_map_state_per_engine(batch);
6284 batch->last_aux_map_state = aux_map_state_num;
6285 }
6286 }
6287
6288 static void
6289 init_aux_map_state(struct iris_batch *batch)
6290 {
6291 struct iris_screen *screen = batch->screen;
6292 void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6293 if (!aux_map_ctx)
6294 return;
6295
6296 uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
6297 assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
6298
6299 uint32_t reg = 0;
6300 switch (batch->name) {
6301 case IRIS_BATCH_COMPUTE:
6302 if (iris_bufmgr_compute_engine_supported(screen->bufmgr)) {
6303 reg = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
6304 break;
6305 }
6306 /* fallthrough */
6307 FALLTHROUGH;
6308 case IRIS_BATCH_RENDER:
6309 reg = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
6310 break;
6311 case IRIS_BATCH_BLITTER:
6312 #if GFX_VERx10 >= 125
6313 reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
6314 #endif
6315 break;
6316 default:
6317 unreachable("Invalid batch for aux map init.");
6318 }
6319
6320 if (reg)
6321 iris_load_register_imm64(batch, reg, base_addr);
6322 }
6323 #endif
6324
6325 struct push_bos {
6326 struct {
6327 struct iris_address addr;
6328 uint32_t length;
6329 } buffers[4];
6330 int buffer_count;
6331 uint32_t max_length;
6332 };
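/* Illustrative example: a shader with two compiler-assigned UBO push ranges
 * of 8 and 4 32-byte units ends up with buffer_count = 2 and max_length = 8,
 * with the addr/length pairs filled in by setup_constant_buffers() below.
 */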
6333
6334 static void
6335 setup_constant_buffers(struct iris_context *ice,
6336 struct iris_batch *batch,
6337 int stage,
6338 struct push_bos *push_bos)
6339 {
6340 struct iris_shader_state *shs = &ice->state.shaders[stage];
6341 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6342
6343 uint32_t push_range_sum = 0;
6344
6345 int n = 0;
6346 for (int i = 0; i < 4; i++) {
6347 const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6348
6349 if (range->length == 0)
6350 continue;
6351
6352 push_range_sum += range->length;
6353
6354 if (range->length > push_bos->max_length)
6355 push_bos->max_length = range->length;
6356
6357 /* Range block is a binding table index, map back to UBO index. */
6358 unsigned block_index = iris_bti_to_group_index(
6359 &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6360 assert(block_index != IRIS_SURFACE_NOT_USED);
6361
6362 struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6363 struct iris_resource *res = (void *) cbuf->buffer;
6364
6365 assert(cbuf->buffer_offset % 32 == 0);
6366
6367 if (res)
6368 iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
6369
6370 push_bos->buffers[n].length = range->length;
6371 push_bos->buffers[n].addr =
6372 res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
6373 : batch->screen->workaround_address;
6374 n++;
6375 }
6376
6377 /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
6378 *
6379 * "The sum of all four read length fields must be less than or
6380 * equal to the size of 64."
6381 */
6382 assert(push_range_sum <= 64);
6383
6384 push_bos->buffer_count = n;
6385 }
6386
6387 static void
6388 emit_push_constant_packets(struct iris_context *ice,
6389 struct iris_batch *batch,
6390 int stage,
6391 const struct push_bos *push_bos)
6392 {
6393 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
6394
6395 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
6396 pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
6397
6398 #if GFX_VER >= 9
6399 pkt.MOCS = isl_mocs(isl_dev, 0, false);
6400 #endif
6401
6402 /* The Skylake PRM contains the following restriction:
6403 *
6404 * "The driver must ensure The following case does not occur
6405 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
6406 * buffer 3 read length equal to zero committed followed by a
6407 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
6408 * zero committed."
6409 *
6410 * To avoid this, we program the buffers in the highest slots.
6411 * This way, slot 0 is only used if slot 3 is also used.
6412 */
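      /* For instance, with two push ranges (n = 2) the shift below is 2,
       * so the data lands in ReadLength[2]/[3] and Buffer[2]/[3]; slots 0
       * and 1 stay zero, which keeps buffer 3 in use whenever buffer 0 is.
       */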
6413 const int n = push_bos->buffer_count;
6414 assert(n <= 4);
6415 const unsigned shift = 4 - n;
6416 for (int i = 0; i < n; i++) {
6417 pkt.ConstantBody.ReadLength[i + shift] =
6418 push_bos->buffers[i].length;
6419 pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
6420 }
6421 }
6422 }
6423
6424 #if GFX_VER >= 12
6425 static void
6426 emit_null_push_constant_tbimr_workaround(struct iris_batch *batch)
6427 {
6428 struct isl_device *isl_dev = &batch->screen->isl_dev;
6429 /* Pass a single-register push constant payload for the PS
6430 * stage even if empty, since PS invocations with zero push
6431 * constant cycles have been found to cause hangs with TBIMR
6432 * enabled. See HSDES #22020184996.
6433 *
6434 * XXX - Use workaround infrastructure and final workaround
6435 * when provided by hardware team.
6436 */
6437 const struct iris_address null_addr = {
6438 .bo = batch->screen->workaround_bo,
6439 .offset = 1024,
6440 };
6441 const uint32_t num_dwords = 2 + 2 * 1;
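   /* Layout: 2 DWords of 3DSTATE_CONSTANT_ALL header plus one 2-DWord
    * 3DSTATE_CONSTANT_ALL_DATA entry for the single dummy pointer, hence
    * 2 + 2 * 1 DWords total.
    */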
6442 uint32_t const_all[num_dwords];
6443 uint32_t *dw = &const_all[0];
6444
6445 iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6446 all.DWordLength = num_dwords - 2;
6447 all.MOCS = isl_mocs(isl_dev, 0, false);
6448 all.ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT);
6449 all.PointerBufferMask = 1;
6450 }
6451 dw += 2;
6452
6453 _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), dw, data) {
6454 data.PointerToConstantBuffer = null_addr;
6455 data.ConstantBufferReadLength = 1;
6456 }
6457
6458 iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6459 }
6460
6461 static void
6462 emit_push_constant_packet_all(struct iris_context *ice,
6463 struct iris_batch *batch,
6464 uint32_t shader_mask,
6465 const struct push_bos *push_bos)
6466 {
6467 struct isl_device *isl_dev = &batch->screen->isl_dev;
6468
6469 if (!push_bos) {
6470 if (batch->screen->devinfo->needs_null_push_constant_tbimr_workaround &&
6471 (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
6472 emit_null_push_constant_tbimr_workaround(batch);
6473 shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
6474 }
6475
6476 if (shader_mask) {
6477 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
6478 pc.ShaderUpdateEnable = shader_mask;
6479 pc.MOCS = iris_mocs(NULL, isl_dev, 0);
6480 }
6481 }
6482 return;
6483 }
6484
6485 const uint32_t n = push_bos->buffer_count;
6486 const uint32_t max_pointers = 4;
6487 const uint32_t num_dwords = 2 + 2 * n;
6488 uint32_t const_all[2 + 2 * max_pointers];
6489 uint32_t *dw = &const_all[0];
6490
6491 assert(n <= max_pointers);
6492 iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6493 all.DWordLength = num_dwords - 2;
6494 all.MOCS = isl_mocs(isl_dev, 0, false);
6495 all.ShaderUpdateEnable = shader_mask;
6496 all.PointerBufferMask = (1 << n) - 1;
6497 }
6498 dw += 2;
6499
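   /* For example, with n = 2 buffers: num_dwords = 6, PointerBufferMask is
    * 0b11, and the loop below appends two CONSTANT_ALL_DATA entries right
    * after the two header DWords.
    */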
6500 for (int i = 0; i < n; i++) {
6501 _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
6502 dw + i * 2, data) {
6503 data.PointerToConstantBuffer = push_bos->buffers[i].addr;
6504 data.ConstantBufferReadLength = push_bos->buffers[i].length;
6505 }
6506 }
6507 iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6508 }
6509 #endif
6510
6511 void
6512 genX(emit_depth_state_workarounds)(struct iris_context *ice,
6513 struct iris_batch *batch,
6514 const struct isl_surf *surf)
6515 {
6516 #if INTEL_NEEDS_WA_1808121037
6517 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6518 surf->samples == 1;
6519
6520 switch (ice->state.genx->depth_reg_mode) {
6521 case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
6522 if (!is_d16_1x_msaa)
6523 return;
6524 break;
6525 case IRIS_DEPTH_REG_MODE_D16_1X_MSAA:
6526 if (is_d16_1x_msaa)
6527 return;
6528 break;
6529 case IRIS_DEPTH_REG_MODE_UNKNOWN:
6530 break;
6531 }
6532
6533 /* We'll change some CHICKEN registers depending on the depth surface
6534 * format. Do a depth flush and stall so the pipeline is not using these
6535 * settings while we change the registers.
6536 */
6537 iris_emit_end_of_pipe_sync(batch,
6538 "Workaround: Stop pipeline for Wa_1808121037",
6539 PIPE_CONTROL_DEPTH_STALL |
6540 PIPE_CONTROL_DEPTH_CACHE_FLUSH);
6541
6542 /* Wa_1808121037
6543 *
6544 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6545 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6546 */
6547 iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6548 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6549 reg.HIZPlaneOptimizationdisablebitMask = true;
6550 }
6551
6552 ice->state.genx->depth_reg_mode =
6553 is_d16_1x_msaa ? IRIS_DEPTH_REG_MODE_D16_1X_MSAA :
6554 IRIS_DEPTH_REG_MODE_HW_DEFAULT;
6555 #endif
6556 }
6557
6558 /* Calculate TBIMR tiling parameters adequate for the current pipeline
6559 * setup. Return true if TBIMR should be enabled.
6560 */
6561 UNUSED static bool
6562 calculate_tile_dimensions(struct iris_context *ice,
6563 unsigned *tile_width, unsigned *tile_height)
6564 {
6565 struct iris_screen *screen = (void *)ice->ctx.screen;
6566 const struct intel_device_info *devinfo = screen->devinfo;
6567
6568 assert(GFX_VER == 12);
6569 const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
6570
6571 /* Perform a rough calculation of the tile cache footprint of the
6572 * pixel pipeline, approximating it as the sum of the amount of
6573 * memory used per pixel by every render target, depth, stencil and
6574 * auxiliary surfaces bound to the pipeline.
6575 */
6576 unsigned pixel_size = 0;
6577
6578 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
6579
6580 if (cso->width == 0 || cso->height == 0)
6581 return false;
6582
6583 for (unsigned i = 0; i < cso->nr_cbufs; i++) {
6584 const struct iris_surface *surf = (void *)cso->cbufs[i];
6585
6586 if (surf) {
6587 const struct iris_resource *res = (void *)surf->base.texture;
6588
6589 pixel_size += intel_calculate_surface_pixel_size(&res->surf);
6590
6591 /* XXX - Pessimistic, in some cases it might be helpful to neglect
6592 * aux surface traffic.
6593 */
6594 if (ice->state.draw_aux_usage[i]) {
6595 pixel_size += intel_calculate_surface_pixel_size(&res->aux.surf);
6596
6597 if (isl_aux_usage_has_ccs(res->aux.usage)) {
6598 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6599 &res->surf), aux_scale);
6600 }
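            /* The CCS term above is a small correction: with aux_scale = 256
             * (the Xe main-to-CCS size ratio), a 4 B/px main surface adds
             * DIV_ROUND_UP(4, 256) = 1 extra byte per pixel.
             */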
6601 }
6602 }
6603 }
6604
6605 if (cso->zsbuf) {
6606 struct iris_resource *zres;
6607 struct iris_resource *sres;
6608 iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres, &sres);
6609
6610 if (zres) {
6611 pixel_size += intel_calculate_surface_pixel_size(&zres->surf);
6612
6613 /* XXX - Pessimistic, in some cases it might be helpful to neglect
6614 * aux surface traffic.
6615 */
6616 if (iris_resource_level_has_hiz(devinfo, zres, cso->zsbuf->u.tex.level)) {
6617 pixel_size += intel_calculate_surface_pixel_size(&zres->aux.surf);
6618
6619 if (isl_aux_usage_has_ccs(zres->aux.usage)) {
6620 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6621 &zres->surf), aux_scale);
6622 }
6623 }
6624 }
6625
6626 if (sres) {
6627 pixel_size += intel_calculate_surface_pixel_size(&sres->surf);
6628 }
6629 }
6630
6631 /* Compute a tile layout that allows reasonable utilization of the
6632 * tile cache based on the per-pixel cache footprint estimated
6633 * above.
6634 */
6635 intel_calculate_tile_dimensions(devinfo, screen->l3_config_3d,
6636 32, 32, cso->width, cso->height, pixel_size,
6637 tile_width, tile_height);
6638
6639 /* Perform TBIMR tile passes only if the framebuffer covers more
6640 * than a single tile.
6641 */
6642 return *tile_width < cso->width || *tile_height < cso->height;
6643 }
6644
6645 static void
6646 iris_preemption_streamout_wa(struct iris_context *ice,
6647 struct iris_batch *batch,
6648 bool enable)
6649 {
6650 #if GFX_VERx10 >= 120
6651 if (!intel_needs_workaround(batch->screen->devinfo, 16013994831))
6652 return;
6653
6654 iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
6655 reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !enable;
6656 reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
6657 }
6658
6659 /* Emit CS_STALL and 250 noops. */
6660 iris_emit_pipe_control_flush(batch, "workaround: Wa_16013994831",
6661 PIPE_CONTROL_CS_STALL);
6662 for (unsigned i = 0; i < 250; i++)
6663 iris_emit_cmd(batch, GENX(MI_NOOP), noop);
6664
6665 ice->state.genx->object_preemption = enable;
6666 #endif
6667 }
6668
6669 static void
6670 shader_program_uses_primitive_id(struct iris_context *ice,
6671 struct iris_batch *batch,
6672 struct iris_compiled_shader *shader,
6673 gl_shader_stage stage,
6674 bool *uses_primitive_id)
6675 {
6676 switch (stage) {
6677 case MESA_SHADER_TESS_CTRL: {
6678 struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
6679 *uses_primitive_id |= tcs_data->include_primitive_id;
6680 break;
6681 }
6682 case MESA_SHADER_TESS_EVAL: {
6683 struct iris_tes_data *tes_data = iris_tes_data(shader);
6684 *uses_primitive_id |= tes_data->include_primitive_id;
6685 break;
6686 }
6687 default:
6688 break;
6689 }
6690
6691 struct iris_compiled_shader *gs_shader =
6692 ice->shaders.prog[MESA_SHADER_GEOMETRY];
6693 const struct iris_gs_data *gs_data =
6694 gs_shader ? iris_gs_data(gs_shader) : NULL;
6695
6696 *uses_primitive_id |= gs_data && gs_data->include_primitive_id;
6697 }
6698
6699 static void
6700 emit_wa_18020335297_dummy_draw(struct iris_batch *batch)
6701 {
6702 #if GFX_VERx10 >= 125
6703 iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
6704 vfg.DistributionMode = RR_STRICT;
6705 }
6706 iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6707 vf.GeometryDistributionEnable = true;
6708 }
6709 #endif
6710
6711 #if GFX_VER >= 12
6712 iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
6713 pr.ReplicaMask = 1;
6714 }
6715 #endif
6716
6717 iris_emit_cmd(batch, GENX(3DSTATE_RASTER), rr) {
6718 rr.CullMode = CULLMODE_NONE;
6719 rr.FrontFaceFillMode = FILL_MODE_SOLID;
6720 rr.BackFaceFillMode = FILL_MODE_SOLID;
6721 }
6722
6723 iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { }
6724 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { }
6725
6726 #if GFX_VER >= 11
6727 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs2) { }
6728 #endif
6729
6730 iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
6731 clip.ClipEnable = true;
6732 clip.ClipMode = CLIPMODE_REJECT_ALL;
6733 }
6734
6735 iris_emit_cmd(batch, GENX(3DSTATE_VS), vs) { }
6736 iris_emit_cmd(batch, GENX(3DSTATE_GS), gs) { }
6737 iris_emit_cmd(batch, GENX(3DSTATE_HS), hs) { }
6738 iris_emit_cmd(batch, GENX(3DSTATE_TE), te) { }
6739 iris_emit_cmd(batch, GENX(3DSTATE_DS), ds) { }
6740 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so) { }
6741
6742 uint32_t vertex_elements[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
6743 uint32_t *ve_pack_dest = &vertex_elements[1];
6744
6745 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), vertex_elements, ve) {
6746 ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
6747 GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
6748 }
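   /* With VERTEX_ELEMENT_STATE_length == 2 and the usual length bias of 2,
    * the DWordLength above works out to 3, i.e. a 5-DWord packet: one
    * header DWord followed by the two element states packed below.
    */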
6749
6750 for (int i = 0; i < 2; i++) {
6751 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6752 ve.Valid = true;
6753 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
6754 ve.Component0Control = VFCOMP_STORE_0;
6755 ve.Component1Control = VFCOMP_STORE_0;
6756 ve.Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6757 ve.Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6758 }
6759 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6760 }
6761
6762 iris_batch_emit(batch, vertex_elements, sizeof(uint32_t) *
6763 (1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)));
6764
6765 iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6766 topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
6767 }
6768
6769 /* Emit dummy draw per slice. */
6770 for (unsigned i = 0; i < batch->screen->devinfo->num_slices; i++) {
6771 iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6772 prim.VertexCountPerInstance = 3;
6773 prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
6774 prim.InstanceCount = 1;
6775 prim.VertexAccessType = SEQUENTIAL;
6776 }
6777 }
6778 }
6779
6780 static void
6781 iris_upload_dirty_render_state(struct iris_context *ice,
6782 struct iris_batch *batch,
6783 const struct pipe_draw_info *draw,
6784 bool skip_vb_params)
6785 {
6786 struct iris_screen *screen = batch->screen;
6787 struct iris_border_color_pool *border_color_pool =
6788 iris_bufmgr_get_border_color_pool(screen->bufmgr);
6789
6790 /* Re-emit 3DSTATE_DS before any 3DPRIMITIVE when tessellation is on */
6791 if (intel_needs_workaround(batch->screen->devinfo, 22018402687) &&
6792 ice->shaders.prog[MESA_SHADER_TESS_EVAL])
6793 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TES;
6794
6795 uint64_t dirty = ice->state.dirty;
6796 uint64_t stage_dirty = ice->state.stage_dirty;
6797
6798 if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
6799 !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
6800 return;
6801
6802 struct iris_genx_state *genx = ice->state.genx;
6803 struct iris_binder *binder = &ice->state.binder;
6804 struct iris_fs_data *fs_data =
6805 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
6806
6807 /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
6808 * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
6809 */
6810 bool needs_wa_14018912822 =
6811 screen->driconf.intel_enable_wa_14018912822 &&
6812 intel_needs_workaround(batch->screen->devinfo, 14018912822) &&
6813 util_framebuffer_get_num_samples(&ice->state.framebuffer) > 1;
6814
6815 if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
6816 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6817 uint32_t cc_vp_address;
6818 bool wa_18020335297_applied = false;
6819
6820 /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
6821 if (intel_needs_workaround(screen->devinfo, 18020335297) &&
6822 batch->name == IRIS_BATCH_RENDER &&
6823 ice->state.viewport_ptr_set) {
6824 emit_wa_18020335297_dummy_draw(batch);
6825 wa_18020335297_applied = true;
6826 }
6827
6828 /* XXX: could avoid streaming for depth_clip [0,1] case. */
6829 uint32_t *cc_vp_map =
6830 stream_state(batch, ice->state.dynamic_uploader,
6831 &ice->state.last_res.cc_vp,
6832 4 * ice->state.num_viewports *
6833 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
6834 for (int i = 0; i < ice->state.num_viewports; i++) {
6835 float zmin, zmax;
6836 iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
6837 ice->state.window_space_position,
6838 &zmin, &zmax);
6839 if (cso_rast->depth_clip_near)
6840 zmin = 0.0;
6841 if (cso_rast->depth_clip_far)
6842 zmax = 1.0;
6843
6844 iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
6845 ccv.MinimumDepth = zmin;
6846 ccv.MaximumDepth = zmax;
6847 }
6848
6849 cc_vp_map += GENX(CC_VIEWPORT_length);
6850 }
6851
6852 iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
6853 ptr.CCViewportPointer = cc_vp_address;
6854 }
6855
6856 if (wa_18020335297_applied) {
6857 #if GFX_VER >= 12
6858 iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { }
6859 #endif
6860 /* Dirty all emitted WA state to make sure that current real
6861 * state is restored.
6862 */
6863 dirty |= IRIS_DIRTY_VFG |
6864 IRIS_DIRTY_VF |
6865 IRIS_DIRTY_RASTER |
6866 IRIS_DIRTY_VF_STATISTICS |
6867 IRIS_DIRTY_VF_SGVS |
6868 IRIS_DIRTY_CLIP |
6869 IRIS_DIRTY_STREAMOUT |
6870 IRIS_DIRTY_VERTEX_ELEMENTS |
6871 IRIS_DIRTY_VF_TOPOLOGY;
6872
6873 for (int stage = 0; stage < MESA_SHADER_FRAGMENT; stage++) {
6874 if (ice->shaders.prog[stage])
6875 stage_dirty |= (IRIS_STAGE_DIRTY_VS << stage);
6876 }
6877 }
6878 ice->state.viewport_ptr_set = true;
6879 }
6880
6881 if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
6882 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6883 uint32_t sf_cl_vp_address;
6884 uint32_t *vp_map =
6885 stream_state(batch, ice->state.dynamic_uploader,
6886 &ice->state.last_res.sf_cl_vp,
6887 4 * ice->state.num_viewports *
6888 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
6889
6890 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
6891 const struct pipe_viewport_state *state = &ice->state.viewports[i];
6892 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
6893
6894 float vp_xmin = viewport_extent(state, 0, -1.0f);
6895 float vp_xmax = viewport_extent(state, 0, 1.0f);
6896 float vp_ymin = viewport_extent(state, 1, -1.0f);
6897 float vp_ymax = viewport_extent(state, 1, 1.0f);
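      /* viewport_extent() (defined earlier in this file) is assumed here to
       * return translate[axis] +/- |scale[axis]|, i.e. the unclamped
       * screen-space edges of the viewport rectangle; they are clamped to
       * the framebuffer bounds when packed below.
       */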
6898
6899 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
6900 state->scale[0], state->scale[1],
6901 state->translate[0], state->translate[1],
6902 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
6903
6904 iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
6905 vp.ViewportMatrixElementm00 = state->scale[0];
6906 vp.ViewportMatrixElementm11 = state->scale[1];
6907 vp.ViewportMatrixElementm22 = state->scale[2];
6908 vp.ViewportMatrixElementm30 = state->translate[0];
6909 vp.ViewportMatrixElementm31 = state->translate[1];
6910 vp.ViewportMatrixElementm32 = state->translate[2];
6911 vp.XMinClipGuardband = gb_xmin;
6912 vp.XMaxClipGuardband = gb_xmax;
6913 vp.YMinClipGuardband = gb_ymin;
6914 vp.YMaxClipGuardband = gb_ymax;
6915 vp.XMinViewPort = MAX2(vp_xmin, 0);
6916 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6917 vp.YMinViewPort = MAX2(vp_ymin, 0);
6918 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6919 }
6920
6921 vp_map += GENX(SF_CLIP_VIEWPORT_length);
6922 }
6923
6924 iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6925 ptr.SFClipViewportPointer = sf_cl_vp_address;
6926 }
6927 }
6928
6929 if (dirty & IRIS_DIRTY_URB) {
6930 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6931 if (!ice->shaders.prog[i]) {
6932 ice->shaders.urb.cfg.size[i] = 1;
6933 } else {
6934 struct iris_vue_data *vue_data =
6935 iris_vue_data(ice->shaders.prog[i]);
6936 ice->shaders.urb.cfg.size[i] = vue_data->urb_entry_size;
6937 }
6938 assert(ice->shaders.urb.cfg.size[i] != 0);
6939 }
6940
6941 genX(emit_urb_config)(batch,
6942 ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
6943 ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL);
6944 }
6945
6946 if (dirty & IRIS_DIRTY_BLEND_STATE) {
6947 struct iris_blend_state *cso_blend = ice->state.cso_blend;
6948 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6949 struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6950
6951 bool color_blend_zero = false;
6952 bool alpha_blend_zero = false;
6953
6954 /* Always write at least one BLEND_STATE - the final RT message will
6955 * reference BLEND_STATE[0] even if there aren't color writes. There
6956 * may still be alpha testing, computed depth, and so on.
6957 */
6958 const int rt_dwords =
6959 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6960
6961 uint32_t blend_offset;
6962 uint32_t *blend_map =
6963 stream_state(batch, ice->state.dynamic_uploader,
6964 &ice->state.last_res.blend,
6965 96, 64, &blend_offset);
6966
6967 /* Copy of blend entries for merging dynamic changes. */
6968 uint32_t blend_entries[4 * rt_dwords];
6969 memcpy(blend_entries, &cso_blend->blend_state[1], sizeof(blend_entries));
6970
6971 unsigned cbufs = MAX2(cso_fb->nr_cbufs, 1);
6972
6973 uint32_t *blend_entry = blend_entries;
6974 for (unsigned i = 0; i < cbufs; i++) {
6975 int dst_blend_factor = cso_blend->ps_dst_blend_factor[i];
6976 int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[i];
6977 uint32_t entry[GENX(BLEND_STATE_ENTRY_length)];
6978 iris_pack_state(GENX(BLEND_STATE_ENTRY), entry, be) {
6979 if (needs_wa_14018912822) {
6980 if (dst_blend_factor == BLENDFACTOR_ZERO) {
6981 dst_blend_factor = BLENDFACTOR_CONST_COLOR;
6982 color_blend_zero = true;
6983 }
6984 if (dst_alpha_blend_factor == BLENDFACTOR_ZERO) {
6985 dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
6986 alpha_blend_zero = true;
6987 }
6988 }
6989 be.DestinationBlendFactor = dst_blend_factor;
6990 be.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
6991 }
6992
6993 /* Merge entry. */
6994 uint32_t *dst = blend_entry;
6995 uint32_t *src = entry;
6996 for (unsigned j = 0; j < GENX(BLEND_STATE_ENTRY_length); j++)
6997 *dst++ |= *src++;
6998
6999 blend_entry += GENX(BLEND_STATE_ENTRY_length);
7000 }
7001
7002 /* Blend constants modified for Wa_14018912822. */
7003 if (ice->state.color_blend_zero != color_blend_zero) {
7004 ice->state.color_blend_zero = color_blend_zero;
7005 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7006 }
7007 if (ice->state.alpha_blend_zero != alpha_blend_zero) {
7008 ice->state.alpha_blend_zero = alpha_blend_zero;
7009 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7010 }
7011
7012 uint32_t blend_state_header;
7013 iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
7014 bs.AlphaTestEnable = cso_zsa->alpha_enabled;
7015 bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
7016 }
7017
7018 blend_map[0] = blend_state_header | cso_blend->blend_state[0];
7019 memcpy(&blend_map[1], blend_entries, 4 * rt_dwords);
7020
7021 iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
7022 ptr.BlendStatePointer = blend_offset;
7023 ptr.BlendStatePointerValid = true;
7024 }
7025 }
7026
7027 if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
7028 struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7029 #if GFX_VER == 8
7030 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7031 #endif
7032 uint32_t cc_offset;
7033 void *cc_map =
7034 stream_state(batch, ice->state.dynamic_uploader,
7035 &ice->state.last_res.color_calc,
7036 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
7037 64, &cc_offset);
7038 iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
7039 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
7040 cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
7041 cc.BlendConstantColorRed = ice->state.color_blend_zero ?
7042 0.0 : ice->state.blend_color.color[0];
7043 cc.BlendConstantColorGreen = ice->state.color_blend_zero ?
7044 0.0 : ice->state.blend_color.color[1];
7045 cc.BlendConstantColorBlue = ice->state.color_blend_zero ?
7046 0.0 : ice->state.blend_color.color[2];
7047 cc.BlendConstantColorAlpha = ice->state.alpha_blend_zero ?
7048 0.0 : ice->state.blend_color.color[3];
7049 #if GFX_VER == 8
7050 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
7051 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7052 #endif
7053 }
7054 iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7055 ptr.ColorCalcStatePointer = cc_offset;
7056 ptr.ColorCalcStatePointerValid = true;
7057 }
7058 }
7059
7060 #if GFX_VERx10 == 125
7061 if (dirty & (IRIS_DIRTY_RENDER_BUFFER | IRIS_DIRTY_DEPTH_BUFFER)) {
7062 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7063 unsigned tile_width, tile_height;
7064
7065 ice->state.use_tbimr = batch->screen->driconf.enable_tbimr &&
7066 calculate_tile_dimensions(ice, &tile_width, &tile_height);
7067
7068 if (ice->state.use_tbimr) {
7069 /* Use a batch size of 128 polygons per slice as recommended
7070 * by BSpec 68436 "TBIMR Programming".
7071 */
7072 const unsigned num_slices = screen->devinfo->num_slices;
7073 const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
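         /* Worked example: on a 4-slice part, batch_size = 2 * 256 = 512
          * polygons, and TBIMRBatchSize below encodes it as
          * util_logbase2(512) - 5 = 4.
          */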
7074
7075 iris_emit_cmd(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
7076 tbimr.TileRectangleHeight = tile_height;
7077 tbimr.TileRectangleWidth = tile_width;
7078 tbimr.VerticalTileCount = DIV_ROUND_UP(cso_fb->height, tile_height);
7079 tbimr.HorizontalTileCount = DIV_ROUND_UP(cso_fb->width, tile_width);
7080 tbimr.TBIMRBatchSize = util_logbase2(batch_size) - 5;
7081 tbimr.TileBoxCheck = true;
7082 }
7083 }
7084 }
7085 #endif
7086
7087 /* Wa_1604061319
7088 *
7089 * 3DSTATE_CONSTANT_* needs to be programmed before BTP_*
7090 *
7091 * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
7092 * any stage has a dirty binding table.
7093 */
7094 const bool emit_const_wa = GFX_VER >= 11 &&
7095 ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
7096 (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
7097
7098 #if GFX_VER >= 12
7099 uint32_t nobuffer_stages = 0;
7100 #endif
7101
7102 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7103 if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
7104 !emit_const_wa)
7105 continue;
7106
7107 struct iris_shader_state *shs = &ice->state.shaders[stage];
7108 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7109
7110 if (!shader)
7111 continue;
7112
7113 if (shs->sysvals_need_upload)
7114 upload_sysvals(ice, stage, NULL);
7115
7116 struct push_bos push_bos = {};
7117 setup_constant_buffers(ice, batch, stage, &push_bos);
7118
7119 #if GFX_VER >= 12
7120 /* If this stage doesn't have any push constants, emit it later in a
7121 * single CONSTANT_ALL packet with all the other stages.
7122 */
7123 if (push_bos.buffer_count == 0) {
7124 nobuffer_stages |= 1 << stage;
7125 continue;
7126 }
7127
7128 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
7129 * contains only 5 bits, so we can only use it for buffers smaller than
7130 * 32.
7131 *
7132 * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
7133 * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
7134 * for disabling stages, where all address bits are zero. However, we
7135 * can't safely use it for general buffers with arbitrary addresses.
7136 * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
7137 * case.
7138 */
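      /* Example: a stage whose largest push range is 24 32-byte units
       * satisfies max_length < 32 and takes the CONSTANT_ALL path on
       * Gfx12.5+, while on Gfx12.0 it falls through to the per-stage
       * 3DSTATE_CONSTANT_XS packets below because of Wa_16011448509.
       */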
7139 if (push_bos.max_length < 32 && GFX_VERx10 > 120) {
7140 emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
7141 continue;
7142 }
7143 #endif
7144 emit_push_constant_packets(ice, batch, stage, &push_bos);
7145 }
7146
7147 #if GFX_VER >= 12
7148 if (nobuffer_stages)
7149 /* Wa_16011448509: all address bits are zero */
7150 emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
7151 #endif
7152
7153 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7154 /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
7155 * in order to commit constants. TODO: Investigate "Disable Gather
7156 * at Set Shader" to go back to legacy mode...
7157 */
7158 if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
7159 (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
7160 << stage)) {
7161 iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
7162 ptr._3DCommandSubOpcode = 38 + stage;
7163 ptr.PointertoVSBindingTable =
7164 binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
7165 }
7166 }
7167 }
7168
7169 if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
7170 // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
7171 // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
7172
7173 /* The PIPE_CONTROL command description says:
7174 *
7175 * "Whenever a Binding Table Index (BTI) used by a Render Target
7176 * Message points to a different RENDER_SURFACE_STATE, SW must issue a
7177 * Render Target Cache Flush by enabling this bit. When render target
7178 * flush is set due to new association of BTI, PS Scoreboard Stall bit
7179 * must be set in this packet."
7180 */
7181 // XXX: does this need to happen at 3DSTATE_BTP_PS time?
7182 iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
7183 PIPE_CONTROL_RENDER_TARGET_FLUSH |
7184 PIPE_CONTROL_STALL_AT_SCOREBOARD);
7185 }
7186
7187 if (dirty & IRIS_DIRTY_RENDER_BUFFER)
7188 trace_framebuffer_state(&batch->trace, NULL, &ice->state.framebuffer);
7189
7190 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7191 if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
7192 iris_populate_binding_table(ice, batch, stage, false);
7193 }
7194 }
7195
7196 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7197 if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
7198 !ice->shaders.prog[stage])
7199 continue;
7200
7201 iris_upload_sampler_states(ice, stage);
7202
7203 struct iris_shader_state *shs = &ice->state.shaders[stage];
7204 struct pipe_resource *res = shs->sampler_table.res;
7205 if (res)
7206 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
7207 IRIS_DOMAIN_NONE);
7208
7209 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
7210 ptr._3DCommandSubOpcode = 43 + stage;
7211 ptr.PointertoVSSamplerState = shs->sampler_table.offset;
7212 }
7213 }
7214
7215 if (ice->state.need_border_colors)
7216 iris_use_pinned_bo(batch, border_color_pool->bo, false, IRIS_DOMAIN_NONE);
7217
7218 if (dirty & IRIS_DIRTY_MULTISAMPLE) {
7219 iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
7220 ms.PixelLocation =
7221 ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
7222 if (ice->state.framebuffer.samples > 0)
7223 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
7224 }
7225 }
7226
7227 if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
7228 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
7229 ms.SampleMask = ice->state.sample_mask;
7230 }
7231 }
7232
7233 #if GFX_VERx10 >= 125
7234 /* This is only used on >= gfx125 for dynamic 3DSTATE_TE and
7235 * 3DSTATE_VFG emission related workarounds.
7236 */
7237 bool program_uses_primitive_id = false;
7238
7239 /* Check if FS stage will use primitive ID overrides. */
7240 const struct intel_vue_map *last_vue_map =
7241 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7242 if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
7243 last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
7244 program_uses_primitive_id = true;
7245 }
7246 #endif
7247
7248 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7249 if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
7250 continue;
7251
7252 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7253
7254 if (shader) {
7255 struct iris_resource *cache = (void *) shader->assembly.res;
7256 iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
7257
7258 uint32_t scratch_addr =
7259 pin_scratch_space(ice, batch, shader, stage);
7260
7261 #if GFX_VERx10 >= 125
7262 shader_program_uses_primitive_id(ice, batch, shader, stage,
7263 &program_uses_primitive_id);
7264 #endif
7265
7266 if (stage == MESA_SHADER_FRAGMENT) {
7267 UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
7268 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7269
7270 uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
7271 _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
7272 #if GFX_VER >= 9
7273 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(shader->brw_prog_data);
7274 #else
7275 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(shader->elk_prog_data);
7276 #endif
7277 intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
7278 wm_prog_data, util_framebuffer_get_num_samples(cso_fb),
7279 0 /* msaa_flags */);
7280
7281 #if GFX_VER == 12
7282 assert(fs_data->dispatch_multi == 0 ||
7283 (fs_data->dispatch_multi == 16 && fs_data->max_polygons == 2));
7284 ps.DualSIMD8DispatchEnable = fs_data->dispatch_multi;
7285 /* XXX - No major improvement observed from enabling
7286 * overlapping subspans, but it could be helpful
7287 * in theory when the requirements listed on the
7288 * BSpec page for 3DSTATE_PS_BODY are met.
7289 */
7290 ps.OverlappingSubspansEnable = false;
7291 #endif
7292
7293 #if GFX_VER >= 9
7294 ps.DispatchGRFStartRegisterForConstantSetupData0 =
7295 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7296 ps.DispatchGRFStartRegisterForConstantSetupData1 =
7297 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7298 #if GFX_VER < 20
7299 ps.DispatchGRFStartRegisterForConstantSetupData2 =
7300 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7301 #endif
7302
7303 ps.KernelStartPointer0 = KSP(shader) +
7304 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7305 ps.KernelStartPointer1 = KSP(shader) +
7306 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7307 #if GFX_VER < 20
7308 ps.KernelStartPointer2 = KSP(shader) +
7309 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7310 #endif
7311 #else
7312 ps.DispatchGRFStartRegisterForConstantSetupData0 =
7313 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7314 ps.DispatchGRFStartRegisterForConstantSetupData1 =
7315 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7316 ps.DispatchGRFStartRegisterForConstantSetupData2 =
7317 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7318
7319 ps.KernelStartPointer0 = KSP(shader) +
7320 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7321 ps.KernelStartPointer1 = KSP(shader) +
7322 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7323 ps.KernelStartPointer2 = KSP(shader) +
7324 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7325 #endif
7326
7327 #if GFX_VERx10 >= 125
7328 ps.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7329 #else
7330 ps.ScratchSpaceBasePointer =
7331 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7332 #endif
7333 }
7334
7335 uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
7336 iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
7337 #if GFX_VER >= 9
7338 if (!fs_data->uses_sample_mask)
7339 psx.InputCoverageMaskState = ICMS_NONE;
7340 else if (fs_data->post_depth_coverage)
7341 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
7342 else if (fs_data->inner_coverage &&
7343 cso->conservative_rasterization)
7344 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
7345 else
7346 psx.InputCoverageMaskState = ICMS_NORMAL;
7347 #else
7348 psx.PixelShaderUsesInputCoverageMask =
7349 fs_data->uses_sample_mask;
7350 #endif
7351 }
7352
7353 uint32_t *shader_ps = (uint32_t *) shader->derived_data;
7354 uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
7355 iris_emit_merge(batch, shader_ps, ps_state,
7356 GENX(3DSTATE_PS_length));
7357 iris_emit_merge(batch, shader_psx, psx_state,
7358 GENX(3DSTATE_PS_EXTRA_length));
7359 #if GFX_VERx10 >= 125
7360 } else if (stage == MESA_SHADER_TESS_EVAL) {
7361 uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
7362 iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
7363 if (intel_needs_workaround(screen->devinfo, 14015055625) &&
7364 program_uses_primitive_id)
7365 te.TessellationDistributionMode = TEDMODE_OFF;
7366 else if (intel_needs_workaround(screen->devinfo, 22012699309))
7367 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
7368 else
7369 te.TessellationDistributionMode = TEDMODE_RR_FREE;
7370 }
7371
7372 uint32_t ds_state[GENX(3DSTATE_DS_length)] = { 0 };
7373 iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
7374 if (scratch_addr)
7375 ds.ScratchSpaceBuffer =
7376 scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7377 }
7378
7379 uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7380 uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7381
7382 iris_emit_merge(batch, shader_ds, ds_state,
7383 GENX(3DSTATE_DS_length));
7384 iris_emit_merge(batch, shader_te, te_state,
7385 GENX(3DSTATE_TE_length));
7386 #endif
7387 } else if (scratch_addr) {
7388 uint32_t *pkt = (uint32_t *) shader->derived_data;
7389 switch (stage) {
7390 case MESA_SHADER_VERTEX: MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
7391 case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
7392 case MESA_SHADER_TESS_EVAL: {
7393 uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7394 uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7395 iris_batch_emit(batch, shader_te, 4 * GENX(3DSTATE_TE_length));
7396 MERGE_SCRATCH_ADDR(3DSTATE_DS);
7397 break;
7398 }
7399 case MESA_SHADER_GEOMETRY: MERGE_SCRATCH_ADDR(3DSTATE_GS); break;
7400 }
7401 } else {
7402 iris_batch_emit(batch, shader->derived_data,
7403 iris_derived_program_state_size(stage));
7404 }
7405 } else {
7406 if (stage == MESA_SHADER_TESS_EVAL) {
7407 iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7408 iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
7409 iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7410 } else if (stage == MESA_SHADER_GEOMETRY) {
7411 iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
7412 }
7413 }
7414 }
7415
7416 #if GFX_VERx10 >= 125
7417 /* Inspect program_uses_primitive_id state and dirty VFG if required. */
7418 if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
7419 program_uses_primitive_id != ice->state.uses_primitive_id) {
7420 dirty |= IRIS_DIRTY_VFG;
7421 ice->state.uses_primitive_id = program_uses_primitive_id;
7422 }
7423 #endif
7424
7425 if (ice->state.streamout_active) {
7426 if (dirty & IRIS_DIRTY_SO_BUFFERS) {
7427 /* Wa_16011411144
7428 * SW must insert a PIPE_CONTROL cmd before and after the
7429 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* state is
7430 * not combined with other state changes.
7431 */
7432 if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7433 iris_emit_pipe_control_flush(batch,
7434 "SO pre change stall WA",
7435 PIPE_CONTROL_CS_STALL);
7436 }
7437
7438 for (int i = 0; i < 4; i++) {
7439 struct iris_stream_output_target *tgt =
7440 (void *) ice->state.so_target[i];
7441 enum { dwords = GENX(3DSTATE_SO_BUFFER_length) };
7442 uint32_t *so_buffers = genx->so_buffers + i * dwords;
7443 bool zero_offset = false;
7444
7445 if (tgt) {
7446 zero_offset = tgt->zero_offset;
7447 iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
7448 true, IRIS_DOMAIN_OTHER_WRITE);
7449 iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
7450 true, IRIS_DOMAIN_OTHER_WRITE);
7451 }
7452
7453 if (zero_offset) {
7454 /* Skip the last DWord which contains "Stream Offset" of
7455 * 0xFFFFFFFF and instead emit a dword of zero directly.
7456 */
7457 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
7458 32 * (dwords - 1));
7459 const uint32_t zero = 0;
7460 iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
7461 iris_batch_emit(batch, &zero, sizeof(zero));
7462 tgt->zero_offset = false;
7463 } else {
7464 iris_batch_emit(batch, so_buffers, 4 * dwords);
7465 }
7466 }
7467
7468 /* Wa_16011411144 */
7469 if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7470 iris_emit_pipe_control_flush(batch,
7471 "SO post change stall WA",
7472 PIPE_CONTROL_CS_STALL);
7473 }
7474 }
7475
7476 if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
7477 /* Wa_16011773973:
7478 * If SOL is enabled and SO_DECL state has to be programmed,
7479 * 1. Send 3D State SOL state with SOL disabled
7480 * 2. Send SO_DECL NP state
7481 * 3. Send 3D State SOL with SOL Enabled
7482 */
7483 if (intel_device_info_is_dg2(batch->screen->devinfo))
7484 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7485
7486 uint32_t *decl_list =
7487 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
7488 iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
7489
7490 #if GFX_VER >= 11 && GFX_VER < 20
7491 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7492 * 3DSTATE_SO_DECL_LIST:
7493 *
7494 * "Workaround: This command must be followed by a PIPE_CONTROL
7495 * with CS Stall bit set."
7496 *
7497 * On DG2+ also known as Wa_1509820217.
7498 */
7499 iris_emit_pipe_control_flush(batch,
7500 "workaround: cs stall after so_decl",
7501 PIPE_CONTROL_CS_STALL);
7502 #endif
7503 }
7504
7505 if (dirty & IRIS_DIRTY_STREAMOUT) {
7506 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7507
7508 #if GFX_VERx10 >= 120
7509 /* Wa_16013994831 - Disable preemption. */
7510 if (intel_needs_workaround(batch->screen->devinfo, 16013994831))
7511 iris_preemption_streamout_wa(ice, batch, false);
7512 #endif
7513
7514 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
7515 iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
7516 sol.SOFunctionEnable = true;
7517 sol.SOStatisticsEnable = true;
7518
7519 sol.RenderingDisable = cso_rast->rasterizer_discard &&
7520 !ice->state.prims_generated_query_active;
7521 sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7522
7523
7524 #if INTEL_NEEDS_WA_18022508906
7525 /* Wa_14017076903 :
7526 *
7527 * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
7528 *
7529 * SOL_INT::Render_Enable =
7530 * (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
7531 * (
7532 * (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
7533 * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
7534 * !3DSTATE_STREAMOUT::API_Render_Disable &&
7535 * (
7536 * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
7537 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
7538 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
7539 * 3DSTATE_PS_EXTRA::PS_Valid ||
7540 * 3DSTATE_WM::Legacy Depth_Buffer_Clear ||
7541 * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
7542 * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
7543 * )
7544 * )
7545 *
7546 * If SOL_INT::Render_Enable is false, the SO stage will not forward any
7547 * topologies down the pipeline, which is not what we want for occlusion
7548 * queries.
7549 *
7550 * Here we force rendering to get SOL_INT::Render_Enable when occlusion
7551 * queries are active.
7552 */
7553 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7554 if (!cso_rast->rasterizer_discard && ice->state.occlusion_query_active)
7555 sol.ForceRendering = Force_on;
7556 #endif
7557 }
7558
7559 assert(ice->state.streamout);
7560
7561 iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
7562 GENX(3DSTATE_STREAMOUT_length));
7563 }
7564 } else {
7565 if (dirty & IRIS_DIRTY_STREAMOUT) {
7566
7567 #if GFX_VERx10 >= 120
7568 /* Wa_16013994831 - Enable preemption. */
7569 if (!ice->state.genx->object_preemption)
7570 iris_preemption_streamout_wa(ice, batch, true);
7571 #endif
7572
7573 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7574 }
7575 }
7576
7577 if (dirty & IRIS_DIRTY_CLIP) {
7578 struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7579 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7580
7581 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
7582 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7583 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
7584 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
7585 : ice->state.prim_is_points_or_lines);
7586 const struct intel_vue_map *last =
7587 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7588
7589 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
7590 iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
7591 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
7592 if (cso_rast->rasterizer_discard)
7593 cl.ClipMode = CLIPMODE_REJECT_ALL;
7594 else if (ice->state.window_space_position)
7595 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
7596 else
7597 cl.ClipMode = CLIPMODE_NORMAL;
7598
7599 cl.PerspectiveDivideDisable = ice->state.window_space_position;
7600 cl.ViewportXYClipTestEnable = !points_or_lines;
7601
7602 cl.NonPerspectiveBarycentricEnable = fs_data->uses_nonperspective_interp_modes;
7603
7604 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1 ||
7605 !(last->slots_valid & VARYING_BIT_LAYER);
7606 cl.MaximumVPIndex = ice->state.num_viewports - 1;
7607 }
7608 iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
7609 ARRAY_SIZE(cso_rast->clip));
7610 }
7611
7612 if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
7613       /* From the Broadwell PRM, Volume 2, documentation for
7614 * 3DSTATE_RASTER, "Antialiasing Enable":
7615 *
7616 * "This field must be disabled if any of the render targets
7617 * have integer (UINT or SINT) surface format."
7618 *
7619 * Additionally internal documentation for Gfx12+ states:
7620 *
7621 * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
7622 * FORCED_SAMPLE_COUNT > 1."
7623 */
7624 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7625 unsigned samples = util_framebuffer_get_num_samples(cso_fb);
7626 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7627
7628 bool aa_enable = cso->line_smooth &&
7629 !ice->state.has_integer_rt &&
7630 !(batch->screen->devinfo->ver >= 12 && samples > 1);
7631
7632 uint32_t dynamic_raster[GENX(3DSTATE_RASTER_length)];
7633 iris_pack_command(GENX(3DSTATE_RASTER), &dynamic_raster, raster) {
7634 raster.AntialiasingEnable = aa_enable;
7635 }
7636 iris_emit_merge(batch, cso->raster, dynamic_raster,
7637 ARRAY_SIZE(cso->raster));
7638
7639 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7640 iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7641 sf.ViewportTransformEnable = !ice->state.window_space_position;
7642
7643 #if GFX_VER >= 12
7644 sf.DerefBlockSize = ice->state.urb_deref_block_size;
7645 #endif
7646 }
7647 iris_emit_merge(batch, cso->sf, dynamic_sf,
7648 ARRAY_SIZE(dynamic_sf));
7649 }
7650
7651 if (dirty & IRIS_DIRTY_WM) {
7652 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7653 uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
7654
7655 iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
7656 wm.StatisticsEnable = ice->state.statistics_counters_enabled;
7657
7658 wm.BarycentricInterpolationMode =
7659 iris_fs_barycentric_modes(ice->shaders.prog[MESA_SHADER_FRAGMENT], 0);
7660
7661 if (fs_data->early_fragment_tests)
7662 wm.EarlyDepthStencilControl = EDSC_PREPS;
7663 else if (fs_data->has_side_effects)
7664 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7665 else
7666 wm.EarlyDepthStencilControl = EDSC_NORMAL;
7667
7668 /* We could skip this bit if color writes are enabled. */
7669 if (fs_data->has_side_effects || fs_data->uses_kill)
7670 wm.ForceThreadDispatchEnable = ForceON;
7671 }
7672 iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
7673 }
7674
7675 if (dirty & IRIS_DIRTY_SBE) {
7676 iris_emit_sbe(batch, ice);
7677 }
7678
7679 if (dirty & IRIS_DIRTY_PS_BLEND) {
7680 struct iris_blend_state *cso_blend = ice->state.cso_blend;
7681 struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7682 const struct shader_info *fs_info =
7683 iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7684
7685 int dst_blend_factor = cso_blend->ps_dst_blend_factor[0];
7686 int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[0];
7687
7688 /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
7689 * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
7690 */
7691 if (needs_wa_14018912822) {
7692 if (ice->state.color_blend_zero)
7693 dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7694 if (ice->state.alpha_blend_zero)
7695 dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7696 }
7697
7698 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7699 iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7700 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7701 pb.AlphaTestEnable = cso_zsa->alpha_enabled;
7702
7703 pb.DestinationBlendFactor = dst_blend_factor;
7704 pb.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7705
7706 /* The dual source blending docs caution against using SRC1 factors
7707 * when the shader doesn't use a dual source render target write.
7708 * Empirically, this can lead to GPU hangs, and the results are
7709 * undefined anyway, so simply disable blending to avoid the hang.
7710 */
7711 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7712 (!cso_blend->dual_color_blending || fs_data->dual_src_blend);
7713 }
7714
7715 iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7716 ARRAY_SIZE(cso_blend->ps_blend));
7717 }
7718
7719 if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
7720 struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7721 #if GFX_VER >= 9 && GFX_VER < 12
7722 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7723 uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7724 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7725 wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7726 wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7727 }
7728 iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
7729 #else
7730 /* Use modify disable fields which allow us to emit packets
7731 * directly instead of merging them later.
7732 */
7733 iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
7734 #endif
7735
7736 /* Depth or stencil write changed in cso. */
7737 if (intel_needs_workaround(batch->screen->devinfo, 18019816803) &&
7738 (dirty & IRIS_DIRTY_DS_WRITE_ENABLE)) {
7739 iris_emit_pipe_control_flush(
7740 batch, "workaround: PSS stall after DS write enable change",
7741 PIPE_CONTROL_PSS_STALL_SYNC);
7742 }
7743
7744 #if GFX_VER >= 12
7745 iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
7746 #endif
7747 }
7748
7749 if (dirty & IRIS_DIRTY_STENCIL_REF) {
7750 #if GFX_VER >= 12
7751 /* Use modify disable fields which allow us to emit packets
7752 * directly instead of merging them later.
7753 */
7754 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7755 uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7756 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7757 wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7758 wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7759 wmds.StencilTestMaskModifyDisable = true;
7760 wmds.StencilWriteMaskModifyDisable = true;
7761 wmds.StencilStateModifyDisable = true;
7762 wmds.DepthStateModifyDisable = true;
7763 }
7764 iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
7765 #endif
7766 }
7767
7768 if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
7769 /* Wa_1409725701:
7770 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
7771 * stored as an array of up to 16 elements. The location of first
7772 * element of the array, as specified by Pointer to SCISSOR_RECT,
7773       *  should be aligned to a 64-byte boundary."
7774 */
7775 uint32_t alignment = 64;
7776 uint32_t scissor_offset =
7777 emit_state(batch, ice->state.dynamic_uploader,
7778 &ice->state.last_res.scissor,
7779 ice->state.scissors,
7780 sizeof(struct pipe_scissor_state) *
7781 ice->state.num_viewports, alignment);
7782
7783 iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7784 ptr.ScissorRectPointer = scissor_offset;
7785 }
7786 }
7787
7788 if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
7789 struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
7790
7791 /* Do not emit the cso yet. We may need to update clear params first. */
7792 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7793 struct iris_resource *zres = NULL, *sres = NULL;
7794 if (cso_fb->zsbuf) {
7795 iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
7796 &zres, &sres);
7797 }
7798
7799 if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
7800 #if GFX_VER < 20
7801 uint32_t *clear_params =
7802 cso_z->packets + ARRAY_SIZE(cso_z->packets) -
7803 GENX(3DSTATE_CLEAR_PARAMS_length);
7804
7805 iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
7806 clear.DepthClearValueValid = true;
7807 clear.DepthClearValue = zres->aux.clear_color.f32[0];
7808 }
7809 #endif
7810 }
7811
7812 iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
7813
7814 if (intel_needs_workaround(batch->screen->devinfo, 1408224581) ||
7815 intel_needs_workaround(batch->screen->devinfo, 14014097488) ||
7816 intel_needs_workaround(batch->screen->devinfo, 14016712196)) {
7817 /* Wa_1408224581
7818 *
7819           * Workaround (Gfx12LP A-step only): an additional pipe control with
7820           * post-sync = store dword operation is required (i.e. emit an
7821           * additional pipe control after the stencil state whenever the
7822           * surface state bits of this state change).
7823 *
7824 * This also seems sufficient to handle Wa_14014097488 and
7825 * Wa_14016712196.
7826 */
7827 iris_emit_pipe_control_write(batch, "WA for depth/stencil state",
7828 PIPE_CONTROL_WRITE_IMMEDIATE,
7829 screen->workaround_address.bo,
7830 screen->workaround_address.offset, 0);
7831 }
7832
7833 if (zres)
7834 genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
7835 }
7836
7837 if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
7838 /* Listen for buffer changes, and also write enable changes. */
7839 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7840 pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
7841 }
7842
7843 if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
7844 iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7845 for (int i = 0; i < 32; i++) {
7846 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7847 }
7848 }
7849 }
7850
7851 if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
7852 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7853 iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7854 #if GFX_VER >= 11
7855 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7856 * 3DSTATE_LINE_STIPPLE:
7857 *
7858 * "Workaround: This command must be followed by a PIPE_CONTROL with
7859 * CS Stall bit set."
7860 */
7861 iris_emit_pipe_control_flush(batch,
7862 "workaround: post 3DSTATE_LINE_STIPPLE",
7863 PIPE_CONTROL_CS_STALL);
7864 #endif
7865 }
7866
7867 if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
7868 iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7869 topo.PrimitiveTopologyType =
7870 translate_prim_type(draw->mode, ice->state.vertices_per_patch);
7871 }
7872 }
7873
7874 if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
7875 int count = util_bitcount64(ice->state.bound_vertex_buffers);
7876 uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
7877
7878 if (ice->state.vs_uses_draw_params && !skip_vb_params) {
7879 assert(ice->draw.draw_params.res);
7880
7881 struct iris_vertex_buffer_state *state =
7882 &(ice->state.genx->vertex_buffers[count]);
7883 pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
7884 struct iris_resource *res = (void *) state->resource;
7885
7886 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7887 vb.VertexBufferIndex = count;
7888 vb.AddressModifyEnable = true;
7889 vb.BufferPitch = 0;
7890 vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
7891 vb.BufferStartingAddress =
7892 ro_bo(NULL, res->bo->address +
7893 (int) ice->draw.draw_params.offset);
7894 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7895 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7896 #if GFX_VER >= 12
7897 vb.L3BypassDisable = true;
7898 #endif
7899 }
7900 dynamic_bound |= 1ull << count;
7901 count++;
7902 }
7903
7904 if (ice->state.vs_uses_derived_draw_params && !skip_vb_params) {
7905 struct iris_vertex_buffer_state *state =
7906 &(ice->state.genx->vertex_buffers[count]);
7907 pipe_resource_reference(&state->resource,
7908 ice->draw.derived_draw_params.res);
7909 struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
7910
7911 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7912 vb.VertexBufferIndex = count;
7913 vb.AddressModifyEnable = true;
7914 vb.BufferPitch = 0;
7915 vb.BufferSize =
7916 res->bo->size - ice->draw.derived_draw_params.offset;
7917 vb.BufferStartingAddress =
7918 ro_bo(NULL, res->bo->address +
7919 (int) ice->draw.derived_draw_params.offset);
7920 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7921 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7922 #if GFX_VER >= 12
7923 vb.L3BypassDisable = true;
7924 #endif
7925 }
7926 dynamic_bound |= 1ull << count;
7927 count++;
7928 }
7929
7930 if (count) {
7931 #if GFX_VER >= 11
7932 /* Gfx11+ doesn't need the cache workaround below */
7933 uint64_t bound = dynamic_bound;
7934 while (bound) {
7935 const int i = u_bit_scan64(&bound);
7936 iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
7937 false, IRIS_DOMAIN_VF_READ);
7938 }
7939 #else
7940 /* The VF cache designers cut corners, and made the cache key's
7941 * <VertexBufferIndex, Memory Address> tuple only consider the bottom
7942 * 32 bits of the address. If you have two vertex buffers which get
7943 * placed exactly 4 GiB apart and use them in back-to-back draw calls,
7944 * you can get collisions (even within a single batch).
7945 *
7946 * So, we need to do a VF cache invalidate if the buffer for a VB
7947            * slot changes [48:32] address bits from the previous time.
7948 */
7949 unsigned flush_flags = 0;
7950
7951 uint64_t bound = dynamic_bound;
7952 while (bound) {
7953 const int i = u_bit_scan64(&bound);
7954 uint16_t high_bits = 0;
7955
7956 struct iris_resource *res =
7957 (void *) genx->vertex_buffers[i].resource;
7958 if (res) {
7959 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
7960
7961 high_bits = res->bo->address >> 32ull;
7962 if (high_bits != ice->state.last_vbo_high_bits[i]) {
7963 flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
7964 PIPE_CONTROL_CS_STALL;
7965 ice->state.last_vbo_high_bits[i] = high_bits;
7966 }
7967 }
7968 }
7969
7970 if (flush_flags) {
7971 iris_emit_pipe_control_flush(batch,
7972 "workaround: VF cache 32-bit key [VB]",
7973 flush_flags);
7974 }
7975 #endif
7976
7977 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7978
7979 uint32_t *map =
7980 iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
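         /* 3DSTATE_VERTEX_BUFFERS is one header DWord plus vb_dwords of state
          * per buffer; DWordLength excludes the first two DWords of the
          * packet, hence (1 + vb_dwords * count) - 2.
          */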
7981 _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7982 vb.DWordLength = (vb_dwords * count + 1) - 2;
7983 }
7984 map += 1;
7985
7986 const struct iris_vertex_element_state *cso_ve =
7987 ice->state.cso_vertex_elements;
7988
7989 bound = dynamic_bound;
7990 while (bound) {
7991 const int i = u_bit_scan64(&bound);
7992
7993 uint32_t vb_stride[GENX(VERTEX_BUFFER_STATE_length)];
7994 struct iris_bo *bo =
7995 iris_resource_bo(genx->vertex_buffers[i].resource);
7996 iris_pack_state(GENX(VERTEX_BUFFER_STATE), &vb_stride, vbs) {
7997 vbs.BufferPitch = cso_ve->stride[i];
7998 /* Unnecessary except to defeat the genxml nonzero checker */
7999 vbs.MOCS = iris_mocs(bo, &screen->isl_dev,
8000 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8001 }
8002 for (unsigned d = 0; d < vb_dwords; d++)
8003 map[d] = genx->vertex_buffers[i].state[d] | vb_stride[d];
8004
8005 map += vb_dwords;
8006 }
8007 }
8008 }
8009
8010 if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
8011 struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
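      /* The hardware requires at least one vertex element, so emit a minimum
       * of one entry (the CSO is expected to hold a placeholder element when
       * none are bound).
       */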
8012 const unsigned entries = MAX2(cso->count, 1);
8013 if (!(ice->state.vs_needs_sgvs_element ||
8014 ice->state.vs_uses_derived_draw_params ||
8015 ice->state.vs_needs_edge_flag)) {
8016 iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
8017 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
8018 } else {
8019 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
8020 const unsigned dyn_count = cso->count +
8021 ice->state.vs_needs_sgvs_element +
8022 ice->state.vs_uses_derived_draw_params;
8023
8024 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
8025 &dynamic_ves, ve) {
8026 ve.DWordLength =
8027 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
8028 }
8029 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
8030 (cso->count - ice->state.vs_needs_edge_flag) *
8031 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
8032 uint32_t *ve_pack_dest =
8033 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
8034 GENX(VERTEX_ELEMENT_STATE_length)];
8035
8036 if (ice->state.vs_needs_sgvs_element) {
8037 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
8038 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
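            /* This SGVS element carries the draw parameters (firstvertex /
             * baseinstance) in components 0/1, read from the draw-params
             * vertex buffer when the VS actually uses them and zeroed
             * otherwise.  3DSTATE_VF_SGVS below writes VertexID/InstanceID
             * into components 2/3 of this same element when needed.
             */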
8039 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8040 ve.Valid = true;
8041 ve.VertexBufferIndex =
8042 util_bitcount64(ice->state.bound_vertex_buffers);
8043 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8044 ve.Component0Control = base_ctrl;
8045 ve.Component1Control = base_ctrl;
8046 ve.Component2Control = VFCOMP_STORE_0;
8047 ve.Component3Control = VFCOMP_STORE_0;
8048 }
8049 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8050 }
8051 if (ice->state.vs_uses_derived_draw_params) {
8052 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8053 ve.Valid = true;
8054 ve.VertexBufferIndex =
8055 util_bitcount64(ice->state.bound_vertex_buffers) +
8056 ice->state.vs_uses_draw_params;
8057 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8058 ve.Component0Control = VFCOMP_STORE_SRC;
8059 ve.Component1Control = VFCOMP_STORE_SRC;
8060 ve.Component2Control = VFCOMP_STORE_0;
8061 ve.Component3Control = VFCOMP_STORE_0;
8062 }
8063 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8064 }
8065 if (ice->state.vs_needs_edge_flag) {
8066 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
8067 ve_pack_dest[i] = cso->edgeflag_ve[i];
8068 }
8069
8070 iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
8071 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
8072 }
8073
8074 if (!ice->state.vs_needs_edge_flag) {
8075 iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
8076 entries * GENX(3DSTATE_VF_INSTANCING_length));
8077 } else {
8078 assert(cso->count > 0);
8079 const unsigned edgeflag_index = cso->count - 1;
8080 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
8081 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
8082 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
8083
8084 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
8085 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
8086 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
8087 vi.VertexElementIndex = edgeflag_index +
8088 ice->state.vs_needs_sgvs_element +
8089 ice->state.vs_uses_derived_draw_params;
8090 }
8091 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
8092 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
8093
8094 iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
8095 entries * GENX(3DSTATE_VF_INSTANCING_length));
8096 }
8097 }
8098
8099 if (dirty & IRIS_DIRTY_VF_SGVS) {
8100 const struct iris_vs_data *vs_data =
8101 iris_vs_data(ice->shaders.prog[MESA_SHADER_VERTEX]);
8102 struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8103
8104 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
8105 if (vs_data->uses_vertexid) {
8106 sgv.VertexIDEnable = true;
8107 sgv.VertexIDComponentNumber = 2;
8108 sgv.VertexIDElementOffset =
8109 cso->count - ice->state.vs_needs_edge_flag;
8110 }
8111
8112 if (vs_data->uses_instanceid) {
8113 sgv.InstanceIDEnable = true;
8114 sgv.InstanceIDComponentNumber = 3;
8115 sgv.InstanceIDElementOffset =
8116 cso->count - ice->state.vs_needs_edge_flag;
8117 }
8118 }
8119 }
8120
8121 if (dirty & IRIS_DIRTY_VF) {
8122 iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
8123 #if GFX_VERx10 >= 125
8124 vf.GeometryDistributionEnable = true;
8125 #endif
8126 if (draw->primitive_restart) {
8127 vf.IndexedDrawCutIndexEnable = true;
8128 vf.CutIndex = draw->restart_index;
8129 }
8130 }
8131 }
8132
8133 #if GFX_VERx10 >= 125
8134 if (dirty & IRIS_DIRTY_VFG) {
8135 iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
8136          /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
8137 vfg.DistributionMode =
8138 ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL ? RR_STRICT :
8139 RR_FREE;
8140 if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
8141 program_uses_primitive_id)
8142 vfg.DistributionGranularity = InstanceLevelGranularity;
8143 else
8144 vfg.DistributionGranularity = BatchLevelGranularity;
8145 #if INTEL_WA_14014851047_GFX_VER
8146 vfg.GranularityThresholdDisable =
8147 intel_needs_workaround(batch->screen->devinfo, 14014851047);
8148 #endif
8149 vfg.ListCutIndexEnable = draw->primitive_restart;
8150 /* 192 vertices for TRILIST_ADJ */
8151 vfg.ListNBatchSizeScale = 0;
8152 /* Batch size of 384 vertices */
8153 vfg.List3BatchSizeScale = 2;
8154 /* Batch size of 128 vertices */
8155 vfg.List2BatchSizeScale = 1;
8156 /* Batch size of 128 vertices */
8157 vfg.List1BatchSizeScale = 2;
8158 /* Batch size of 256 vertices for STRIP topologies */
8159 vfg.StripBatchSizeScale = 3;
8160 /* 192 control points for PATCHLIST_3 */
8161 vfg.PatchBatchSizeScale = 1;
8162 /* 192 control points for PATCHLIST_3 */
8163 vfg.PatchBatchSizeMultiplier = 31;
8164 }
8165 }
8166 #endif
8167
8168 if (dirty & IRIS_DIRTY_VF_STATISTICS) {
8169 iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
8170 vf.StatisticsEnable = true;
8171 }
8172 }
8173
8174 #if GFX_VER == 8
8175 if (dirty & IRIS_DIRTY_PMA_FIX) {
8176 bool enable = want_pma_fix(ice);
8177 genX(update_pma_fix)(ice, batch, enable);
8178 }
8179 #endif
8180
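   /* If an earlier operation left a non-default pixel hashing scale
    * programmed, restore the default 1:1 hashing before drawing.
    */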
8181 if (ice->state.current_hash_scale != 1)
8182 genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
8183
8184 #if GFX_VER >= 12
8185 genX(invalidate_aux_map_state)(batch);
8186 #endif
8187 }
8188
8189 static void
8190 flush_vbos(struct iris_context *ice, struct iris_batch *batch)
8191 {
8192 struct iris_genx_state *genx = ice->state.genx;
8193 uint64_t bound = ice->state.bound_vertex_buffers;
8194 while (bound) {
8195 const int i = u_bit_scan64(&bound);
8196 struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
8197 iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
8198 }
8199 }
8200
8201 static bool
8202 point_or_line_list(enum mesa_prim prim_type)
8203 {
8204 switch (prim_type) {
8205 case MESA_PRIM_POINTS:
8206 case MESA_PRIM_LINES:
8207 case MESA_PRIM_LINE_STRIP:
8208 case MESA_PRIM_LINES_ADJACENCY:
8209 case MESA_PRIM_LINE_STRIP_ADJACENCY:
8210 case MESA_PRIM_LINE_LOOP:
8211 return true;
8212 default:
8213 return false;
8214 }
8215 return false;
8216 }
8217
8218 void
8219 genX(emit_breakpoint)(struct iris_batch *batch, bool emit_before_draw)
8220 {
8221 struct iris_context *ice = batch->ice;
8222 uint32_t draw_count = emit_before_draw ?
8223 p_atomic_inc_return(&ice->draw_call_count) :
8224 p_atomic_read(&ice->draw_call_count);
8225
8226 if (((draw_count == intel_debug_bkp_before_draw_count &&
8227 emit_before_draw) ||
8228 (draw_count == intel_debug_bkp_after_draw_count &&
8229 !emit_before_draw))) {
8230 iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
8231 sem.WaitMode = PollingMode;
8232 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
8233 sem.SemaphoreDataDword = 0x1;
8234 sem.SemaphoreAddress = rw_bo(batch->screen->breakpoint_bo, 0,
8235 IRIS_DOMAIN_OTHER_WRITE);
8236 };
8237 }
8238 }
8239
8240 void
8241 genX(emit_3dprimitive_was)(struct iris_batch *batch,
8242 const struct pipe_draw_indirect_info *indirect,
8243 uint32_t primitive_type,
8244 uint32_t vertex_count)
8245 {
8246 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8247 UNUSED const struct iris_context *ice = batch->ice;
8248
8249 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
8250 if (intel_needs_workaround(devinfo, 22014412737) &&
8251 (point_or_line_list(primitive_type) || indirect ||
8252 (vertex_count == 1 || vertex_count == 2))) {
8253 iris_emit_pipe_control_write(batch, "Wa_22014412737",
8254 PIPE_CONTROL_WRITE_IMMEDIATE,
8255 batch->screen->workaround_bo,
8256 batch->screen->workaround_address.offset,
8257 0ull);
8258 batch->num_3d_primitives_emitted = 0;
8259 } else if (intel_needs_workaround(devinfo, 16014538804)) {
8260 batch->num_3d_primitives_emitted++;
8261
8262       /* Wa_16014538804 - Send an empty/dummy pipe control after every three 3DPRIMITIVEs. */
8263 if (batch->num_3d_primitives_emitted == 3) {
8264 iris_emit_pipe_control_flush(batch, "Wa_16014538804", 0);
8265 batch->num_3d_primitives_emitted = 0;
8266 }
8267 }
8268 #endif
8269 }
8270
8271 void
8272 genX(urb_workaround)(struct iris_batch *batch,
8273 const struct intel_urb_config *urb_cfg)
8274 {
8275 #if INTEL_NEEDS_WA_16014912113
8276 if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
8277 MESA_SHADER_TESS_EVAL) &&
8278 batch->ice->shaders.last_urb.size[0] != 0) {
8279 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
8280 iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
8281 urb._3DCommandSubOpcode += i;
8282 urb.VSURBStartingAddress =
8283 batch->ice->shaders.last_urb.start[i];
8284 urb.VSURBEntryAllocationSize =
8285 batch->ice->shaders.last_urb.size[i] - 1;
8286 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
8287 }
8288 }
8289 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8290 pc.HDCPipelineFlushEnable = true;
8291 }
8292 }
8293 #endif
8294
8295 /* Update current urb config. */
8296 memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
8297 sizeof(struct intel_urb_config));
8298 }
8299
8300 static void
8301 iris_emit_index_buffer(struct iris_context *ice,
8302 struct iris_batch *batch,
8303 const struct pipe_draw_info *draw,
8304 const struct pipe_draw_start_count_bias *sc)
8305 {
8306 unsigned offset;
8307
8308 if (draw->has_user_indices) {
8309 unsigned start_offset = draw->index_size * sc->start;
8310
8311 u_upload_data(ice->ctx.const_uploader, start_offset,
8312 sc->count * draw->index_size, 4,
8313 (char*)draw->index.user + start_offset,
8314 &offset, &ice->state.last_res.index_buffer);
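      /* Rebase the returned offset so that the index buffer starting address
       * plus sc->start * index_size lands on the data we just uploaded.
       */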
8315 offset -= start_offset;
8316 } else {
8317 struct iris_resource *res = (void *) draw->index.resource;
8318 res->bind_history |= PIPE_BIND_INDEX_BUFFER;
8319
8320 pipe_resource_reference(&ice->state.last_res.index_buffer,
8321 draw->index.resource);
8322 offset = 0;
8323
8324 iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
8325 }
8326
8327 struct iris_genx_state *genx = ice->state.genx;
8328 struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
8329
8330 uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
8331 iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
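      /* IndexFormat encodes 1/2/4-byte indices as 0/1/2, i.e. index_size >> 1. */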
8332 ib.IndexFormat = draw->index_size >> 1;
8333 ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
8334 ISL_SURF_USAGE_INDEX_BUFFER_BIT);
8335 ib.BufferSize = bo->size - offset;
8336 ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
8337 #if GFX_VER >= 12
8338 ib.L3BypassDisable = true;
8339 #endif
8340 }
8341
8342 if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
8343 memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
8344 iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
8345 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
8346 }
8347
8348 #if GFX_VER < 11
8349 /* The VF cache key only uses 32-bits, see vertex buffer comment above */
8350 uint16_t high_bits = bo->address >> 32ull;
8351 if (high_bits != ice->state.last_index_bo_high_bits) {
8352 iris_emit_pipe_control_flush(batch,
8353 "workaround: VF cache 32-bit key [IB]",
8354 PIPE_CONTROL_VF_CACHE_INVALIDATE |
8355 PIPE_CONTROL_CS_STALL);
8356 ice->state.last_index_bo_high_bits = high_bits;
8357 }
8358 #endif
8359 }
8360
8361
8362 static void
8363 iris_upload_render_state(struct iris_context *ice,
8364 struct iris_batch *batch,
8365 const struct pipe_draw_info *draw,
8366 unsigned drawid_offset,
8367 const struct pipe_draw_indirect_info *indirect,
8368 const struct pipe_draw_start_count_bias *sc)
8369 {
8370 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8371 bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8372
8373 trace_intel_begin_draw(&batch->trace);
8374
8375 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8376 flush_vbos(ice, batch);
8377
8378 iris_batch_sync_region_start(batch);
8379
8380 /* Always pin the binder. If we're emitting new binding table pointers,
8381 * we need it. If not, we're probably inheriting old tables via the
8382 * context, and need it anyway. Since true zero-bindings cases are
8383 * practically non-existent, just pin it and avoid last_res tracking.
8384 */
8385 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8386 IRIS_DOMAIN_NONE);
8387
8388 if (!batch->contains_draw) {
8389 if (GFX_VER == 12) {
8390 /* Re-emit constants when starting a new batch buffer in order to
8391 * work around push constant corruption on context switch.
8392 *
8393 * XXX - Provide hardware spec quotation when available.
8394 */
8395 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8396 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8397 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8398 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8399 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8400 }
8401 batch->contains_draw = true;
8402 }
8403
8404 if (!batch->contains_draw_with_next_seqno) {
8405 iris_restore_render_saved_bos(ice, batch, draw);
8406 batch->contains_draw_with_next_seqno = true;
8407 }
8408
8409 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8410 * Wa_16011107343 (same for gfx12)
8411 * We implement this by setting TCS dirty on each draw.
8412 */
8413 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8414 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8415 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8416 }
8417
8418 iris_upload_dirty_render_state(ice, batch, draw, false);
8419
8420 if (draw->index_size > 0)
8421 iris_emit_index_buffer(ice, batch, draw, sc);
8422
8423 if (indirect) {
8424 struct mi_builder b;
8425 uint32_t mocs;
8426 mi_builder_init(&b, batch->screen->devinfo, batch);
8427
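      /* Command streamer registers that source the 3DPRIMITIVE parameters
       * when IndirectParameterEnable is set; the MI stores below fill them
       * from the indirect buffer.
       */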
8428 #define _3DPRIM_END_OFFSET 0x2420
8429 #define _3DPRIM_START_VERTEX 0x2430
8430 #define _3DPRIM_VERTEX_COUNT 0x2434
8431 #define _3DPRIM_INSTANCE_COUNT 0x2438
8432 #define _3DPRIM_START_INSTANCE 0x243C
8433 #define _3DPRIM_BASE_VERTEX 0x2440
8434
8435 if (!indirect->count_from_stream_output) {
8436 if (indirect->indirect_draw_count) {
8437 use_predicate = true;
8438
8439 struct iris_bo *draw_count_bo =
8440 iris_resource_bo(indirect->indirect_draw_count);
8441 unsigned draw_count_offset =
8442 indirect->indirect_draw_count_offset;
8443 mocs = iris_mocs(draw_count_bo, &batch->screen->isl_dev, 0);
8444 mi_builder_set_mocs(&b, mocs);
8445
8446 if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
8447 /* comparison = draw id < draw count */
8448 struct mi_value comparison =
8449 mi_ult(&b, mi_imm(drawid_offset),
8450 mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8451
8452 /* predicate = comparison & conditional rendering predicate */
8453 mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
8454 mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
8455 } else {
8456 uint32_t mi_predicate;
8457
8458 /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
8459 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(drawid_offset));
8460 /* Upload the current draw count from the draw parameters buffer
8461 * to MI_PREDICATE_SRC0. Zero the top 32-bits of
8462 * MI_PREDICATE_SRC0.
8463 */
8464 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
8465 mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8466
8467 if (drawid_offset == 0) {
8468 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
8469 MI_PREDICATE_COMBINEOP_SET |
8470 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8471 } else {
8472 /* While draw_index < draw_count the predicate's result will be
8473 * (draw_index == draw_count) ^ TRUE = TRUE
8474 * When draw_index == draw_count the result is
8475 * (TRUE) ^ TRUE = FALSE
8476 * After this all results will be:
8477 * (FALSE) ^ FALSE = FALSE
8478 */
8479 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
8480 MI_PREDICATE_COMBINEOP_XOR |
8481 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8482 }
8483 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
8484 }
8485 }
8486 struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8487 assert(bo);
8488
8489 mocs = iris_mocs(bo, &batch->screen->isl_dev, 0);
8490 mi_builder_set_mocs(&b, mocs);
8491
8492 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8493 mi_mem32(ro_bo(bo, indirect->offset + 0)));
8494 mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8495 mi_mem32(ro_bo(bo, indirect->offset + 4)));
8496 mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX),
8497 mi_mem32(ro_bo(bo, indirect->offset + 8)));
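         /* Indexed indirect draws carry five dwords (count, instance count,
          * first index, base vertex, base instance); non-indexed draws carry
          * four, so the trailing parameters live at different offsets.
          */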
8498 if (draw->index_size) {
8499 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX),
8500 mi_mem32(ro_bo(bo, indirect->offset + 12)));
8501 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8502 mi_mem32(ro_bo(bo, indirect->offset + 16)));
8503 } else {
8504 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8505 mi_mem32(ro_bo(bo, indirect->offset + 12)));
8506 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8507 }
8508 } else if (indirect->count_from_stream_output) {
8509 struct iris_stream_output_target *so =
8510 (void *) indirect->count_from_stream_output;
8511 struct iris_bo *so_bo = iris_resource_bo(so->offset.res);
8512
8513 mocs = iris_mocs(so_bo, &batch->screen->isl_dev, 0);
8514 mi_builder_set_mocs(&b, mocs);
8515
8516 iris_emit_buffer_barrier_for(batch, so_bo, IRIS_DOMAIN_OTHER_READ);
8517
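         /* Draw auto: the vertex count is the number of bytes written to the
          * SO buffer (current write offset minus the buffer's base offset)
          * divided by the stride.
          */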
8518 struct iris_address addr = ro_bo(so_bo, so->offset.offset);
8519 struct mi_value offset =
8520 mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
8521 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8522 mi_udiv32_imm(&b, offset, so->stride));
8523 mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX), mi_imm(0));
8524 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8525 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE), mi_imm(0));
8526 mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8527 mi_imm(draw->instance_count));
8528 }
8529 }
8530
8531 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8532
8533 genX(maybe_emit_breakpoint)(batch, true);
8534
8535 iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8536 prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8537 prim.PredicateEnable = use_predicate;
8538 #if GFX_VERx10 >= 125
8539 prim.TBIMREnable = ice->state.use_tbimr;
8540 #endif
8541 if (indirect) {
8542 prim.IndirectParameterEnable = true;
8543 } else {
8544 prim.StartInstanceLocation = draw->start_instance;
8545 prim.InstanceCount = draw->instance_count;
8546 prim.VertexCountPerInstance = sc->count;
8547
8548 prim.StartVertexLocation = sc->start;
8549
8550 if (draw->index_size) {
8551 prim.BaseVertexLocation += sc->index_bias;
8552 }
8553 }
8554 }
8555
8556 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8557 genX(maybe_emit_breakpoint)(batch, false);
8558
8559 iris_batch_sync_region_end(batch);
8560
8561 uint32_t count = (sc) ? sc->count : 0;
8562 count *= draw->instance_count ? draw->instance_count : 1;
8563 trace_intel_end_draw(&batch->trace, count);
8564 }
8565
8566 static void
8567 iris_upload_indirect_render_state(struct iris_context *ice,
8568 const struct pipe_draw_info *draw,
8569 const struct pipe_draw_indirect_info *indirect,
8570 const struct pipe_draw_start_count_bias *sc)
8571 {
8572 #if GFX_VERx10 >= 125
8573 assert(indirect);
8574
8575 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8576 UNUSED struct iris_screen *screen = batch->screen;
8577 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8578 const bool use_predicate =
8579 ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8580
8581 trace_intel_begin_draw(&batch->trace);
8582
8583 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8584 flush_vbos(ice, batch);
8585
8586 iris_batch_sync_region_start(batch);
8587
8588 /* Always pin the binder. If we're emitting new binding table pointers,
8589 * we need it. If not, we're probably inheriting old tables via the
8590 * context, and need it anyway. Since true zero-bindings cases are
8591 * practically non-existent, just pin it and avoid last_res tracking.
8592 */
8593 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8594 IRIS_DOMAIN_NONE);
8595
8596 if (!batch->contains_draw) {
8597 /* Re-emit constants when starting a new batch buffer in order to
8598 * work around push constant corruption on context switch.
8599 *
8600 * XXX - Provide hardware spec quotation when available.
8601 */
8602 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8603 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8604 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8605 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8606 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8607 batch->contains_draw = true;
8608 }
8609
8610 if (!batch->contains_draw_with_next_seqno) {
8611 iris_restore_render_saved_bos(ice, batch, draw);
8612 batch->contains_draw_with_next_seqno = true;
8613 }
8614
8615 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8616 * Wa_16011107343 (same for gfx12)
8617 * We implement this by setting TCS dirty on each draw.
8618 */
8619 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8620 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8621 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8622 }
8623
8624 iris_upload_dirty_render_state(ice, batch, draw, false);
8625
8626 if (draw->index_size > 0)
8627 iris_emit_index_buffer(ice, batch, draw, sc);
8628
8629 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8630
8631 genX(maybe_emit_breakpoint)(batch, true);
8632
8633 iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
8634 ind.ArgumentFormat =
8635 draw->index_size > 0 ? XI_DRAWINDEXED : XI_DRAW;
8636 ind.PredicateEnable = use_predicate;
8637 ind.TBIMREnabled = ice->state.use_tbimr;
8638 ind.MaxCount = indirect->draw_count;
8639
8640 if (indirect->buffer) {
8641 struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8642 ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
8643 ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
8644 } else {
8645 ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8646 }
8647
8648 if (indirect->indirect_draw_count) {
8649 struct iris_bo *draw_count_bo =
8650 iris_resource_bo(indirect->indirect_draw_count);
8651 ind.CountBufferIndirectEnable = true;
8652 ind.CountBufferAddress =
8653 ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
8654 }
8655 }
8656
8657 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8658 genX(maybe_emit_breakpoint)(batch, false);
8659
8660 iris_batch_sync_region_end(batch);
8661
8662 uint32_t count = (sc) ? sc->count : 0;
8663 count *= draw->instance_count ? draw->instance_count : 1;
8664 trace_intel_end_draw(&batch->trace, count);
8665 #else
8666 unreachable("Unsupported path");
8667 #endif /* GFX_VERx10 >= 125 */
8668 }
8669
8670 static void
8671 iris_upload_indirect_shader_render_state(struct iris_context *ice,
8672 const struct pipe_draw_info *draw,
8673 const struct pipe_draw_indirect_info *indirect,
8674 const struct pipe_draw_start_count_bias *sc)
8675 {
8676 assert(indirect);
8677
8678 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8679 UNUSED struct iris_screen *screen = batch->screen;
8680 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8681
8682 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8683 flush_vbos(ice, batch);
8684
8685 iris_batch_sync_region_start(batch);
8686
8687 /* Always pin the binder. If we're emitting new binding table pointers,
8688 * we need it. If not, we're probably inheriting old tables via the
8689 * context, and need it anyway. Since true zero-bindings cases are
8690 * practically non-existent, just pin it and avoid last_res tracking.
8691 */
8692 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8693 IRIS_DOMAIN_NONE);
8694
8695 if (!batch->contains_draw) {
8696 if (GFX_VER == 12) {
8697 /* Re-emit constants when starting a new batch buffer in order to
8698 * work around push constant corruption on context switch.
8699 *
8700 * XXX - Provide hardware spec quotation when available.
8701 */
8702 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8703 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8704 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8705 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8706 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8707 }
8708 batch->contains_draw = true;
8709 }
8710
8711 if (!batch->contains_draw_with_next_seqno) {
8712 iris_restore_render_saved_bos(ice, batch, draw);
8713 batch->contains_draw_with_next_seqno = true;
8714 }
8715
8716 if (draw->index_size > 0)
8717 iris_emit_index_buffer(ice, batch, draw, sc);
8718
8719    /* Make sure we have enough space to keep all the commands in a single BO
8720     * (because of the jumps).
8721 */
8722 iris_require_command_space(batch, 2000);
8723
8724 #ifndef NDEBUG
8725 struct iris_bo *command_bo = batch->bo;
8726 #endif
8727
8728    /* Jump point to generate more draws if we run out of space in the ring
8729 * buffer.
8730 */
8731 uint64_t gen_addr = iris_batch_current_address_u64(batch);
8732
8733 iris_handle_always_flush_cache(batch);
8734
8735 #if GFX_VER == 9
8736 iris_emit_pipe_control_flush(batch, "before generation",
8737 PIPE_CONTROL_VF_CACHE_INVALIDATE);
8738 #endif
8739
8740 struct iris_address params_addr;
8741 struct iris_gen_indirect_params *params =
8742 genX(emit_indirect_generate)(batch, draw, indirect, sc,
8743                                  &params_addr);
8744
8745 iris_emit_pipe_control_flush(batch, "after generation flush",
8746 ((ice->state.vs_uses_draw_params ||
8747 ice->state.vs_uses_derived_draw_params) ?
8748 PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
8749 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8750 PIPE_CONTROL_DATA_CACHE_FLUSH |
8751 PIPE_CONTROL_CS_STALL);
8752
8753 trace_intel_begin_draw(&batch->trace);
8754
8755 /* Always pin the binder. If we're emitting new binding table pointers,
8756 * we need it. If not, we're probably inheriting old tables via the
8757 * context, and need it anyway. Since true zero-bindings cases are
8758 * practically non-existent, just pin it and avoid last_res tracking.
8759 */
8760 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8761 IRIS_DOMAIN_NONE);
8762
8763 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8764 * Wa_16011107343 (same for gfx12)
8765 * We implement this by setting TCS dirty on each draw.
8766 */
8767 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8768 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8769 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8770 }
8771
8772 iris_upload_dirty_render_state(ice, batch, draw, true);
8773
8774 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8775
8776 genX(maybe_emit_breakpoint)(batch, true);
8777
8778 #if GFX_VER >= 12
8779 iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
8780 arb.PreParserDisableMask = true;
8781 arb.PreParserDisable = true;
8782 }
8783 #endif
8784
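   /* Jump into the ring buffer holding the generated draw commands; the
    * generated stream is expected to jump back to the addresses we record
    * in params (gen_addr / end_addr) below.
    */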
8785 iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8786 bbs.AddressSpaceIndicator = ASI_PPGTT;
8787 bbs.BatchBufferStartAddress = (struct iris_address) {
8788 .bo = ice->draw.generation.ring_bo,
8789 };
8790 }
8791
8792 /* Run the ring buffer one more time with the next set of commands */
8793 uint64_t inc_addr = iris_batch_current_address_u64(batch);
8794 {
8795 iris_emit_pipe_control_flush(batch,
8796 "post generated draws wait",
8797 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8798 PIPE_CONTROL_CS_STALL);
8799
8800 struct mi_builder b;
8801 mi_builder_init(&b, batch->screen->devinfo, batch);
8802
8803 struct iris_address draw_base_addr = iris_address_add(
8804 params_addr,
8805 offsetof(struct iris_gen_indirect_params, draw_base));
8806
8807 const uint32_t mocs =
8808 iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
8809 mi_builder_set_mocs(&b, mocs);
8810
8811 mi_store(&b, mi_mem32(draw_base_addr),
8812 mi_iadd(&b, mi_mem32(draw_base_addr),
8813 mi_imm(params->ring_count)));
8814
8815 iris_emit_pipe_control_flush(batch,
8816 "post generation base increment",
8817 PIPE_CONTROL_CS_STALL |
8818 PIPE_CONTROL_CONST_CACHE_INVALIDATE);
8819
8820 iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8821 bbs.AddressSpaceIndicator = ASI_PPGTT;
8822 bbs.BatchBufferStartAddress = (struct iris_address) {
8823 .offset = gen_addr,
8824 };
8825 }
8826 }
8827
8828 /* Exit of the ring buffer */
8829 uint64_t end_addr = iris_batch_current_address_u64(batch);
8830
8831 #ifndef NDEBUG
8832 assert(command_bo == batch->bo);
8833 #endif
8834
8835 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8836 genX(maybe_emit_breakpoint)(batch, false);
8837
8838 iris_emit_pipe_control_flush(batch,
8839 "post generated draws wait",
8840 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8841 PIPE_CONTROL_CS_STALL);
8842
8843 params->gen_addr = inc_addr;
8844 params->end_addr = end_addr;
8845
8846 iris_batch_sync_region_end(batch);
8847
8848 uint32_t count = (sc) ? sc->count : 0;
8849 count *= draw->instance_count ? draw->instance_count : 1;
8850 trace_intel_end_draw(&batch->trace, count);
8851 }
8852
8853 static void
8854 iris_load_indirect_location(struct iris_context *ice,
8855 struct iris_batch *batch,
8856 const struct pipe_grid_info *grid)
8857 {
8858 #define GPGPU_DISPATCHDIMX 0x2500
8859 #define GPGPU_DISPATCHDIMY 0x2504
8860 #define GPGPU_DISPATCHDIMZ 0x2508
8861
8862 assert(grid->indirect);
8863
8864 struct iris_state_ref *grid_size = &ice->state.grid_size;
8865 struct iris_bo *bo = iris_resource_bo(grid_size->res);
8866 struct mi_builder b;
8867 mi_builder_init(&b, batch->screen->devinfo, batch);
8868 struct mi_value size_x = mi_mem32(ro_bo(bo, grid_size->offset + 0));
8869 struct mi_value size_y = mi_mem32(ro_bo(bo, grid_size->offset + 4));
8870 struct mi_value size_z = mi_mem32(ro_bo(bo, grid_size->offset + 8));
8871 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
8872 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
8873 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
8874 }
8875
8876 static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo)
8877 {
8878    /* TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch. */
8879 return devinfo->has_indirect_unroll;
8880 }
8881
8882 #if GFX_VERx10 >= 125
8883
8884 static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
8885 struct iris_batch *batch,
8886 const struct pipe_grid_info *grid,
8887 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd)
8888 {
8889 const struct iris_screen *screen = batch->screen;
8890 struct iris_compiled_shader *shader =
8891 ice->shaders.prog[MESA_SHADER_COMPUTE];
8892 const struct iris_cs_data *cs_data = iris_cs_data(shader);
8893 const struct intel_cs_dispatch_info dispatch =
8894 iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
8895 struct iris_bo *indirect = iris_resource_bo(grid->indirect);
8896 const int dispatch_size = dispatch.simd_size / 16;
8897
8898 struct GENX(COMPUTE_WALKER_BODY) body = {};
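   /* SIMDSize/MessageSIMD encode the dispatch width as simd_size / 16
    * (SIMD16 -> 1, SIMD32 -> 2).
    */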
8899 body.SIMDSize = dispatch_size;
8900 body.MessageSIMD = dispatch_size;
8901 body.GenerateLocalID = cs_data->generate_local_id != 0;
8902 body.EmitLocal = cs_data->generate_local_id;
8903 body.WalkOrder = cs_data->walk_order;
8904 body.TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
8905 TileY32bpe : Linear;
8906 body.LocalXMaximum = grid->block[0] - 1;
8907 body.LocalYMaximum = grid->block[1] - 1;
8908 body.LocalZMaximum = grid->block[2] - 1;
8909 body.ExecutionMask = dispatch.right_mask;
8910 body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8911 body.InterfaceDescriptor = idd;
8912
8913 struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
8914 iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {
8915 ind.PredicateEnable =
8916 ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8917 ind.MaxCount = 1;
8918 ind.COMPUTE_WALKER_BODY = body;
8919 ind.ArgumentBufferStartAddress = indirect_bo;
8920 ind.MOCS =
8921 iris_mocs(indirect_bo.bo, &screen->isl_dev, 0);
8922 }
8923 }
8924
8925 static void
8926 iris_upload_compute_walker(struct iris_context *ice,
8927 struct iris_batch *batch,
8928 const struct pipe_grid_info *grid)
8929 {
8930 const uint64_t stage_dirty = ice->state.stage_dirty;
8931 struct iris_screen *screen = batch->screen;
8932 const struct intel_device_info *devinfo = screen->devinfo;
8933 struct iris_binder *binder = &ice->state.binder;
8934 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
8935 struct iris_compiled_shader *shader =
8936 ice->shaders.prog[MESA_SHADER_COMPUTE];
8937 const struct iris_cs_data *cs_data = iris_cs_data(shader);
8938 const struct intel_cs_dispatch_info dispatch =
8939 iris_get_cs_dispatch_info(devinfo, shader, grid->block);
8940
8941 trace_intel_begin_compute(&batch->trace);
8942
8943 if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
8944 iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
8945 cfe.MaximumNumberofThreads =
8946 devinfo->max_cs_threads * devinfo->subslice_total;
8947 uint32_t scratch_addr = pin_scratch_space(ice, batch, shader,
8948 MESA_SHADER_COMPUTE);
8949 cfe.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
8950 }
8951 }
8952
8953 struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
8954 idd.KernelStartPointer = KSP(shader);
8955 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
8956 idd.SharedLocalMemorySize =
8957 intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
8958 idd.PreferredSLMAllocationSize =
8959 intel_compute_preferred_slm_calc_encode_size(devinfo,
8960 shader->total_shared,
8961 dispatch.group_size,
8962 dispatch.simd_size);
8963 idd.SamplerStatePointer = shs->sampler_table.offset;
8964    idd.SamplerCount = encode_sampler_count(shader);
8965 idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
8966 /* Typically set to 0 to avoid prefetching on every thread dispatch. */
8967 idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
8968 0 : MIN2(shader->bt.size_bytes / 4, 31);
8969 idd.NumberOfBarriers = cs_data->uses_barrier;
8970
8971 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
8972
8973 if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) {
8974 iris_emit_execute_indirect_dispatch(ice, batch, grid, idd);
8975 } else {
8976 if (grid->indirect)
8977 iris_load_indirect_location(ice, batch, grid);
8978
8979 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
8980
8981 ice->utrace.last_compute_walker =
8982 iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
8983 _iris_pack_command(batch, GENX(COMPUTE_WALKER),
8984 ice->utrace.last_compute_walker, cw) {
8985 cw.IndirectParameterEnable = grid->indirect;
8986 cw.SIMDSize = dispatch.simd_size / 16;
8987 cw.MessageSIMD = dispatch.simd_size / 16;
8988 cw.LocalXMaximum = grid->block[0] - 1;
8989 cw.LocalYMaximum = grid->block[1] - 1;
8990 cw.LocalZMaximum = grid->block[2] - 1;
8991 cw.ThreadGroupIDXDimension = grid->grid[0];
8992 cw.ThreadGroupIDYDimension = grid->grid[1];
8993 cw.ThreadGroupIDZDimension = grid->grid[2];
8994 cw.ExecutionMask = dispatch.right_mask;
8995 cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8996 cw.InterfaceDescriptor = idd;
8997
8998 #if GFX_VERx10 >= 125
8999 cw.GenerateLocalID = cs_data->generate_local_id != 0;
9000 cw.EmitLocal = cs_data->generate_local_id;
9001 cw.WalkOrder = cs_data->walk_order;
9002 cw.TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9003 TileY32bpe : Linear;
9004 #endif
9005
9006 assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0);
9007 }
9008 }
9009
9010 trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2]);
9011 }
9012
9013 #else /* #if GFX_VERx10 >= 125 */
9014
9015 static void
9016 iris_upload_gpgpu_walker(struct iris_context *ice,
9017 struct iris_batch *batch,
9018 const struct pipe_grid_info *grid)
9019 {
9020 const uint64_t stage_dirty = ice->state.stage_dirty;
9021 struct iris_screen *screen = batch->screen;
9022 const struct intel_device_info *devinfo = screen->devinfo;
9023 struct iris_binder *binder = &ice->state.binder;
9024 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9025 struct iris_uncompiled_shader *ish =
9026 ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
9027 struct iris_compiled_shader *shader =
9028 ice->shaders.prog[MESA_SHADER_COMPUTE];
9029 struct iris_cs_data *cs_data = iris_cs_data(shader);
9030 const struct intel_cs_dispatch_info dispatch =
9031 iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9032
9033 trace_intel_begin_compute(&batch->trace);
9034
9035 if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9036 cs_data->local_size[0] == 0 /* Variable local group size */) {
9037 /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
9038 *
9039 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
9040 * the only bits that are changed are scoreboard related: Scoreboard
9041 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
9042 * these scoreboard related states, a MEDIA_STATE_FLUSH is
9043 * sufficient."
9044 */
9045 iris_emit_pipe_control_flush(batch,
9046 "workaround: stall before MEDIA_VFE_STATE",
9047 PIPE_CONTROL_CS_STALL);
9048
9049 iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
9050 if (shader->total_scratch) {
9051 uint32_t scratch_addr =
9052 pin_scratch_space(ice, batch, shader, MESA_SHADER_COMPUTE);
9053
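            /* Per-thread scratch space uses a power-of-two encoding: the
             * field stores log2(total_scratch / 1KB), so 0 means 1KB per
             * thread.
             */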
9054 vfe.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
9055 vfe.ScratchSpaceBasePointer =
9056 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
9057 }
9058
9059 vfe.MaximumNumberofThreads =
9060 devinfo->max_cs_threads * devinfo->subslice_total - 1;
9061 #if GFX_VER < 11
9062 vfe.ResetGatewayTimer =
9063 Resettingrelativetimerandlatchingtheglobaltimestamp;
9064 #endif
9065 #if GFX_VER == 8
9066 vfe.BypassGatewayControl = true;
9067 #endif
9068 vfe.NumberofURBEntries = 2;
9069 vfe.URBEntryAllocationSize = 2;
9070
9071 vfe.CURBEAllocationSize =
9072 ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
9073 cs_data->push.cross_thread.regs, 2);
9074 }
9075 }
9076
9077 /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
9078 if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9079 cs_data->local_size[0] == 0 /* Variable local group size */) {
9080 uint32_t curbe_data_offset = 0;
9081 assert(cs_data->push.cross_thread.dwords == 0 &&
9082 cs_data->push.per_thread.dwords == 1 &&
9083 cs_data->first_param_is_builtin_subgroup_id);
9084 const unsigned push_const_size =
9085 iris_cs_push_const_total_size(shader, dispatch.threads);
9086 uint32_t *curbe_data_map =
9087 stream_state(batch, ice->state.dynamic_uploader,
9088 &ice->state.last_res.cs_thread_ids,
9089 ALIGN(push_const_size, 64), 64,
9090 &curbe_data_offset);
9091 assert(curbe_data_map);
9092 memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
9093 iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
9094 curbe_data_map);
9095
9096 iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
9097 curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
9098 curbe.CURBEDataStartAddress = curbe_data_offset;
9099 }
9100 }
9101
9102 for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
9103 struct pipe_resource *res = ice->state.global_bindings[i];
9104 if (!res)
9105 break;
9106
9107 iris_use_pinned_bo(batch, iris_resource_bo(res),
9108 true, IRIS_DOMAIN_NONE);
9109 }
9110
9111 if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
9112 IRIS_STAGE_DIRTY_BINDINGS_CS |
9113 IRIS_STAGE_DIRTY_CONSTANTS_CS |
9114 IRIS_STAGE_DIRTY_CS)) {
9115 uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
9116
9117 iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
9118 idd.SharedLocalMemorySize =
9119 intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
9120 idd.KernelStartPointer =
9121 KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
9122 idd.SamplerStatePointer = shs->sampler_table.offset;
9123 idd.BindingTablePointer =
9124 binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
9125 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9126 }
9127
9128 for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
9129 desc[i] |= ((uint32_t *) shader->derived_data)[i];
9130
9131 iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
9132 load.InterfaceDescriptorTotalLength =
9133 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
9134 load.InterfaceDescriptorDataStartAddress =
9135 emit_state(batch, ice->state.dynamic_uploader,
9136 &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
9137 }
9138 }
9139
9140 if (grid->indirect)
9141 iris_load_indirect_location(ice, batch, grid);
9142
9143 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9144
9145 iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
9146 ggw.IndirectParameterEnable = grid->indirect != NULL;
9147 ggw.SIMDSize = dispatch.simd_size / 16;
9148 ggw.ThreadDepthCounterMaximum = 0;
9149 ggw.ThreadHeightCounterMaximum = 0;
9150 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
9151 ggw.ThreadGroupIDXDimension = grid->grid[0];
9152 ggw.ThreadGroupIDYDimension = grid->grid[1];
9153 ggw.ThreadGroupIDZDimension = grid->grid[2];
9154 ggw.RightExecutionMask = dispatch.right_mask;
9155 ggw.BottomExecutionMask = 0xffffffff;
9156 }
9157
9158 iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
9159
9160 trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2]);
9161 }
9162
9163 #endif /* #if GFX_VERx10 >= 125 */
9164
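/**
 * Upload dirty compute state and dispatch a compute grid.
 *
 * Pins the binder and other referenced BOs, uploads system values, binding
 * tables, and sampler states as needed, then hands off to the
 * generation-specific walker emission path.
 */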
9165 static void
iris_upload_compute_state(struct iris_context *ice,
                          struct iris_batch *batch,
                          const struct pipe_grid_info *grid)
9169 {
9170 struct iris_screen *screen = batch->screen;
9171 const uint64_t stage_dirty = ice->state.stage_dirty;
9172 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9173 struct iris_compiled_shader *shader =
9174 ice->shaders.prog[MESA_SHADER_COMPUTE];
9175 struct iris_border_color_pool *border_color_pool =
9176 iris_bufmgr_get_border_color_pool(screen->bufmgr);
9177
9178 iris_batch_sync_region_start(batch);
9179
9180 /* Always pin the binder. If we're emitting new binding table pointers,
9181 * we need it. If not, we're probably inheriting old tables via the
9182 * context, and need it anyway. Since true zero-bindings cases are
9183 * practically non-existent, just pin it and avoid last_res tracking.
9184 */
9185 iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
9186
9187 if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
9188 shs->sysvals_need_upload) ||
9189 shader->kernel_input_size > 0)
9190 upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
9191
9192 if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
9193 iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
9194
9195 if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
9196 iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
9197
9198 iris_use_optional_res(batch, shs->sampler_table.res, false,
9199 IRIS_DOMAIN_NONE);
9200 iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
9201 IRIS_DOMAIN_NONE);
9202
9203 if (ice->state.need_border_colors)
9204 iris_use_pinned_bo(batch, border_color_pool->bo, false,
9205 IRIS_DOMAIN_NONE);
9206
9207 #if GFX_VER >= 12
9208 genX(invalidate_aux_map_state)(batch);
9209 #endif
9210
9211 #if GFX_VERx10 >= 125
9212 iris_upload_compute_walker(ice, batch, grid);
9213 #else
9214 iris_upload_gpgpu_walker(ice, batch, grid);
9215 #endif
9216
9217 if (!batch->contains_draw_with_next_seqno) {
9218 iris_restore_compute_saved_bos(ice, batch, grid);
9219 batch->contains_draw_with_next_seqno = batch->contains_draw = true;
9220 }
9221
9222 iris_batch_sync_region_end(batch);
9223 }
9224
9225 /**
9226 * State module teardown.
9227 */
9228 static void
iris_destroy_state(struct iris_context *ice)
9230 {
9231 struct iris_genx_state *genx = ice->state.genx;
9232
9233 pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
9234
9235 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
9236 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
9237 pipe_resource_reference(&ice->draw.generation.params.res, NULL);
9238 pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
9239
9240 /* Loop over all VBOs, including ones for draw parameters */
9241 for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
9242 pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
9243 }
9244
9245 free(ice->state.genx);
9246
9247 for (int i = 0; i < 4; i++) {
9248 pipe_so_target_reference(&ice->state.so_target[i], NULL);
9249 }
9250
9251 util_unreference_framebuffer_state(&ice->state.framebuffer);
9252
9253 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
9254 struct iris_shader_state *shs = &ice->state.shaders[stage];
9255 pipe_resource_reference(&shs->sampler_table.res, NULL);
9256 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
9257 pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
9258 pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
9259 }
9260 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
9261 pipe_resource_reference(&shs->image[i].base.resource, NULL);
9262 pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
9263 free(shs->image[i].surface_state.cpu);
9264 }
9265 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
9266 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
9267 pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
9268 }
9269 for (int i = 0; i < IRIS_MAX_TEXTURES; i++) {
9270 pipe_sampler_view_reference((struct pipe_sampler_view **)
9271 &shs->textures[i], NULL);
9272 }
9273 }
9274
9275 pipe_resource_reference(&ice->state.grid_size.res, NULL);
9276 pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
9277
9278 pipe_resource_reference(&ice->state.null_fb.res, NULL);
9279 pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
9280
9281 pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
9282 pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
9283 pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
9284 pipe_resource_reference(&ice->state.last_res.scissor, NULL);
9285 pipe_resource_reference(&ice->state.last_res.blend, NULL);
9286 pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
9287 pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
9288 pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
9289 }
9290
9291 /* ------------------------------------------------------------------- */
9292
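/**
 * Re-point cached state at a buffer's new backing storage.
 *
 * When a PIPE_BUFFER resource's backing BO changes, walk every binding
 * point that may hold a stale GPU address (vertex buffers, stream output
 * targets, constant buffers, SSBOs, sampler views, and shader images),
 * update the address or surface state, and flag the relevant dirty bits so
 * the state is re-emitted.
 */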
9293 static void
iris_rebind_buffer(struct iris_context *ice,
                   struct iris_resource *res)
9296 {
9297 struct pipe_context *ctx = &ice->ctx;
9298 struct iris_genx_state *genx = ice->state.genx;
9299
9300 assert(res->base.b.target == PIPE_BUFFER);
9301
9302 /* Buffers can't be framebuffer attachments, nor display related,
9303 * and we don't have upstream Clover support.
9304 */
9305 assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
9306 PIPE_BIND_RENDER_TARGET |
9307 PIPE_BIND_BLENDABLE |
9308 PIPE_BIND_DISPLAY_TARGET |
9309 PIPE_BIND_CURSOR |
9310 PIPE_BIND_COMPUTE_RESOURCE |
9311 PIPE_BIND_GLOBAL)));
9312
9313 if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
9314 uint64_t bound_vbs = ice->state.bound_vertex_buffers;
9315 while (bound_vbs) {
9316 const int i = u_bit_scan64(&bound_vbs);
9317 struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
9318
9319 /* Update the CPU struct */
9320 STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
9321 STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
9322 uint64_t *addr = (uint64_t *) &state->state[1];
9323 struct iris_bo *bo = iris_resource_bo(state->resource);
9324
9325 if (*addr != bo->address + state->offset) {
9326 *addr = bo->address + state->offset;
9327 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
9328 IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
9329 }
9330 }
9331 }
9332
9333 /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
9334 * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
9335 *
9336 * There is also no need to handle these:
9337 * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
9338 * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
9339 */
9340
9341 if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
9342 uint32_t *so_buffers = genx->so_buffers;
9343 for (unsigned i = 0; i < 4; i++,
9344 so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
9345
9346 /* There are no other fields in bits 127:64 */
9347 uint64_t *addr = (uint64_t *) &so_buffers[2];
9348 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
9349 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
9350
9351 struct pipe_stream_output_target *tgt = ice->state.so_target[i];
9352 if (tgt) {
9353 struct iris_bo *bo = iris_resource_bo(tgt->buffer);
9354 if (*addr != bo->address + tgt->buffer_offset) {
9355 *addr = bo->address + tgt->buffer_offset;
9356 ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
9357 }
9358 }
9359 }
9360 }
9361
9362 for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
9363 struct iris_shader_state *shs = &ice->state.shaders[s];
9364 enum pipe_shader_type p_stage = stage_to_pipe(s);
9365
9366 if (!(res->bind_stages & (1 << s)))
9367 continue;
9368
9369 if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
9370 /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
9371 uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
9372 while (bound_cbufs) {
9373 const int i = u_bit_scan(&bound_cbufs);
9374 struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
9375 struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
9376
9377 if (res->bo == iris_resource_bo(cbuf->buffer)) {
9378 pipe_resource_reference(&surf_state->res, NULL);
9379 shs->dirty_cbufs |= 1u << i;
9380 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
9381 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
9382 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
9383 }
9384 }
9385 }
9386
9387 if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
9388 uint32_t bound_ssbos = shs->bound_ssbos;
9389 while (bound_ssbos) {
9390 const int i = u_bit_scan(&bound_ssbos);
9391 struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
9392
9393 if (res->bo == iris_resource_bo(ssbo->buffer)) {
9394 struct pipe_shader_buffer buf = {
9395 .buffer = &res->base.b,
9396 .buffer_offset = ssbo->buffer_offset,
9397 .buffer_size = ssbo->buffer_size,
9398 };
9399 iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
9400 (shs->writable_ssbos >> i) & 1);
9401 }
9402 }
9403 }
9404
9405 if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
9406 int i;
9407 BITSET_FOREACH_SET(i, shs->bound_sampler_views, IRIS_MAX_TEXTURES) {
9408 struct iris_sampler_view *isv = shs->textures[i];
9409 struct iris_bo *bo = isv->res->bo;
9410
9411 if (update_surface_state_addrs(ice->state.surface_uploader,
9412 &isv->surface_state, bo)) {
9413 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9414 }
9415 }
9416 }
9417
9418 if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
9419 uint64_t bound_image_views = shs->bound_image_views;
9420 while (bound_image_views) {
9421 const int i = u_bit_scan64(&bound_image_views);
9422 struct iris_image_view *iv = &shs->image[i];
9423 struct iris_bo *bo = iris_resource_bo(iv->base.resource);
9424
9425 if (update_surface_state_addrs(ice->state.surface_uploader,
9426 &iv->surface_state, bo)) {
9427 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9428 }
9429 }
9430 }
9431 }
9432 }
9433
9434 /* ------------------------------------------------------------------- */
9435
9436 /**
9437 * Introduce a batch synchronization boundary, and update its cache coherency
9438 * status to reflect the execution of a PIPE_CONTROL command with the
9439 * specified flags.
9440 */
9441 static void
batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
9443 {
9444 const struct intel_device_info *devinfo = batch->screen->devinfo;
9445
9446 iris_batch_sync_boundary(batch);
9447
9448 if ((flags & PIPE_CONTROL_CS_STALL)) {
9449 if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9450 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9451
9452 if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9453 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9454
9455 if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
9456 /* A tile cache flush makes any C/Z data in L3 visible to memory. */
9457 const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
9458 const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
9459 batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
9460 batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
9461 }
9462
9463 if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9464 /* HDC and DC flushes both flush the data cache out to L3 */
9465 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9466 }
9467
9468 if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9469 /* A DC flush also flushes L3 data cache lines out to memory. */
9470 const unsigned i = IRIS_DOMAIN_DATA_WRITE;
9471 batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
9472 }
9473
9474 if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9475 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9476
9477 if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
9478 PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
9479 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
9480 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9481 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9482 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
9483 }
9484 }
9485
9486 if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9487 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9488
9489 if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9490 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9491
9492 if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH))
9493 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9494
9495 if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9496 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9497
9498 if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
9499 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
9500
9501 if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE))
9502 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9503
9504 /* Technically, to invalidate IRIS_DOMAIN_PULL_CONSTANT_READ, we need
9505 * both "Constant Cache Invalidate" and either "Texture Cache Invalidate"
9506 * or "Data Cache Flush" set, depending on the setting of
9507 * iris_indirect_ubos_use_sampler().
9508 *
9509 * However, "Data Cache Flush" and "Constant Cache Invalidate" will never
9510 * appear in the same PIPE_CONTROL command, because one is bottom-of-pipe
9511 * while the other is top-of-pipe. Because we only look at one flush at
9512 * a time, we won't see both together.
9513 *
9514 * To deal with this, we mark it as invalidated when the constant cache
9515 * is invalidated, and trust the callers to also flush the other related
9516 * cache correctly at the same time.
9517 */
9518 if ((flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
9519 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9520
9521 /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
9522
9523 if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
9524 /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
9525 * domains will now be visible to those L3 clients.
9526 */
9527 for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
9528 if (!iris_domain_is_l3_coherent(devinfo, i))
9529 batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
9530 }
9531 }
9532 }
9533
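/**
 * Translate PIPE_CONTROL write flags into the hardware "Post Sync
 * Operation" field value, or zero (no write) if none is requested.
 */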
9534 static unsigned
flags_to_post_sync_op(uint32_t flags)
9536 {
9537 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
9538 return WriteImmediateData;
9539
9540 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
9541 return WritePSDepthCount;
9542
9543 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
9544 return WriteTimestamp;
9545
9546 return 0;
9547 }
9548
9549 /**
9550 * Do the given flags have a Post Sync or LRI Post Sync operation?
9551 */
9552 static enum pipe_control_flags
get_post_sync_flags(enum pipe_control_flags flags)
9554 {
9555 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
9556 PIPE_CONTROL_WRITE_DEPTH_COUNT |
9557 PIPE_CONTROL_WRITE_TIMESTAMP |
9558 PIPE_CONTROL_LRI_POST_SYNC_OP;
9559
9560 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
9561 * "LRI Post Sync Operation". So more than one bit set would be illegal.
9562 */
9563 assert(util_bitcount(flags) <= 1);
9564
9565 return flags;
9566 }
9567
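/* True if this batch targets the compute engine rather than render. */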
9568 #define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)
9569
9570 /**
9571 * Emit a series of PIPE_CONTROL commands, taking into account any
9572 * workarounds necessary to actually accomplish the caller's request.
9573 *
9574 * Unless otherwise noted, spec quotations in this function come from:
9575 *
9576 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
9577 * Restrictions for PIPE_CONTROL.
9578 *
9579 * You should not use this function directly. Use the helpers in
9580 * iris_pipe_control.c instead, which may split the pipe control further.
9581 */
9582 static void
iris_emit_raw_pipe_control(struct iris_batch *batch,
                           const char *reason,
                           uint32_t flags,
                           struct iris_bo *bo,
                           uint32_t offset,
                           uint64_t imm)
9589 {
9590 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
9591 enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
9592 enum pipe_control_flags non_lri_post_sync_flags =
9593 post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
9594
9595 #if GFX_VER >= 12
9596 if (batch->name == IRIS_BATCH_BLITTER) {
9597 batch_mark_sync_for_pipe_control(batch, flags);
9598 iris_batch_sync_region_start(batch);
9599
9600 assert(!(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT));
9601
9602 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
9603 if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
9604 batch_emit_fast_color_dummy_blit(batch);
9605
9606 /* The blitter doesn't actually use PIPE_CONTROL; rather it uses the
9607 * MI_FLUSH_DW command. However, all of our code is set up to flush
9608 * via emitting a pipe control, so we just translate it at this point,
9609 * even if it is a bit hacky.
9610 */
9611 iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
9612 fd.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
9613 fd.ImmediateData = imm;
9614 fd.PostSyncOperation = flags_to_post_sync_op(flags);
9615 #if GFX_VERx10 >= 125
9616 /* TODO: This may not always be necessary */
9617 fd.FlushCCS = true;
9618 #endif
9619 }
9620 iris_batch_sync_region_end(batch);
9621 return;
9622 }
9623 #endif
9624
9625 /* The "L3 Read Only Cache Invalidation Bit" docs say it "controls the
9626 * invalidation of the Geometry streams cached in L3 cache at the top
9627 * of the pipe". In other words, index & vertex data that gets cached
9628 * in L3 when VERTEX_BUFFER_STATE::L3BypassDisable is set.
9629 *
9630 * Normally, invalidating L1/L2 read-only caches also invalidate their
9631 * related L3 cachelines, but this isn't the case for the VF cache.
9632 * Emulate it by setting the L3 Read Only bit when doing a VF invalidate.
9633 */
9634 if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)
9635 flags |= PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
9636
9637 /* Recursive PIPE_CONTROL workarounds --------------------------------
9638 * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
9639 *
9640 * We do these first because we want to look at the original operation,
9641 * rather than any workarounds we set.
9642 */
9643 if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
9644 /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
9645 * lists several workarounds:
9646 *
9647 * "Project: SKL, KBL, BXT
9648 *
9649 * If the VF Cache Invalidation Enable is set to a 1 in a
9650 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
9651 * sets to 0, with the VF Cache Invalidation Enable set to 0
9652 * needs to be sent prior to the PIPE_CONTROL with VF Cache
9653 * Invalidation Enable set to a 1."
9654 */
9655 iris_emit_raw_pipe_control(batch,
9656 "workaround: recursive VF cache invalidate",
9657 0, NULL, 0, 0);
9658 }
9659
9660 if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
9661 /* Project: SKL / Argument: LRI Post Sync Operation [23]
9662 *
9663 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9664 * programmed prior to programming a PIPECONTROL command with "LRI
9665 * Post Sync Operation" in GPGPU mode of operation (i.e when
9666 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
9667 *
9668 * The same text exists a few rows below for Post Sync Op.
9669 */
9670 iris_emit_raw_pipe_control(batch,
9671 "workaround: CS stall before gpgpu post-sync",
9672 PIPE_CONTROL_CS_STALL, bo, offset, imm);
9673 }
9674
9675 /* "Flush Types" workarounds ---------------------------------------------
9676 * We do these now because they may add post-sync operations or CS stalls.
9677 */
9678
9679 if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
9680 /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
9681 *
9682 * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
9683 * 'Write PS Depth Count' or 'Write Timestamp'."
9684 */
9685 if (!bo) {
9686 flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9687 post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9688 non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9689 bo = batch->screen->workaround_address.bo;
9690 offset = batch->screen->workaround_address.offset;
9691 }
9692 }
9693
9694 if (flags & PIPE_CONTROL_DEPTH_STALL) {
9695 /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
9696 *
9697 * "This bit must be DISABLED for operations other than writing
9698 * PS_DEPTH_COUNT."
9699 *
9700 * This seems like nonsense. An Ivybridge workaround requires us to
9701 * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
9702 * operation. Gfx8+ requires us to emit depth stalls and depth cache
9703 * flushes together. So, it's hard to imagine this means anything other
9704 * than "we originally intended this to be used for PS_DEPTH_COUNT".
9705 *
9706 * We ignore the supposed restriction and do nothing.
9707 */
9708 }
9709
9710 if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
9711 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9712 /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
9713 *
9714 * "This bit must be DISABLED for End-of-pipe (Read) fences,
9715 * PS_DEPTH_COUNT or TIMESTAMP queries."
9716 *
9717 * TODO: Implement end-of-pipe checking.
9718 */
9719 assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
9720 PIPE_CONTROL_WRITE_TIMESTAMP)));
9721 }
9722
9723 if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9724 /* From the PIPE_CONTROL instruction table, bit 1:
9725 *
9726 * "This bit is ignored if Depth Stall Enable is set.
9727 * Further, the render cache is not flushed even if Write Cache
9728 * Flush Enable bit is set."
9729 *
9730 * We assert that the caller doesn't do this combination, to try and
9731 * prevent mistakes. It shouldn't hurt the GPU, though.
9732 *
9733 * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
9734 * and "Render Target Flush" combo is explicitly required for BTI
9735 * update workarounds.
9736 */
9737 assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
9738 PIPE_CONTROL_RENDER_TARGET_FLUSH)));
9739 }
9740
9741 /* PIPE_CONTROL page workarounds ------------------------------------- */
9742
9743 if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
9744 /* From the PIPE_CONTROL page itself:
9745 *
9746 * "IVB, HSW, BDW
9747 * Restriction: Pipe_control with CS-stall bit set must be issued
9748 * before a pipe-control command that has the State Cache
9749 * Invalidate bit set."
9750 */
9751 flags |= PIPE_CONTROL_CS_STALL;
9752 }
9753
9754 if (flags & PIPE_CONTROL_FLUSH_LLC) {
9755 /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
9756 *
9757 * "Project: ALL
9758 * SW must always program Post-Sync Operation to "Write Immediate
9759 * Data" when Flush LLC is set."
9760 *
9761 * For now, we just require the caller to do it.
9762 */
9763 assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
9764 }
9765
9766 /* Emulate a HDC flush with a full Data Cache Flush on older hardware which
9767 * doesn't support the new lightweight flush.
9768 */
9769 #if GFX_VER < 12
9770 if (flags & PIPE_CONTROL_FLUSH_HDC)
9771 flags |= PIPE_CONTROL_DATA_CACHE_FLUSH;
9772 #endif
9773
9774 /* "Post-Sync Operation" workarounds -------------------------------- */
9775
9776 /* Project: All / Argument: Global Snapshot Count Reset [19]
9777 *
9778 * "This bit must not be exercised on any product.
9779 * Requires stall bit ([20] of DW1) set."
9780 *
9781 * We don't use this, so we just assert that it isn't used. The
9782 * PIPE_CONTROL instruction page indicates that they intended this
9783 * as a debug feature and don't think it is useful in production,
9784 * but it may actually be usable, should we ever want to.
9785 */
9786 assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
9787
9788 if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
9789 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
9790 /* Project: All / Arguments:
9791 *
9792 * - Generic Media State Clear [16]
9793 * - Indirect State Pointers Disable [16]
9794 *
9795 * "Requires stall bit ([20] of DW1) set."
9796 *
9797 * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
9798 * State Clear) says:
9799 *
9800 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9801 * programmed prior to programming a PIPECONTROL command with "Media
9802 * State Clear" set in GPGPU mode of operation"
9803 *
9804 * This is a subset of the earlier rule, so there's nothing to do.
9805 */
9806 flags |= PIPE_CONTROL_CS_STALL;
9807 }
9808
9809 if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
9810 /* Project: All / Argument: Store Data Index
9811 *
9812 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9813 * than '0'."
9814 *
9815 * For now, we just assert that the caller does this. We might want to
9816 * automatically add a write to the workaround BO...
9817 */
9818 assert(non_lri_post_sync_flags != 0);
9819 }
9820
9821 if (flags & PIPE_CONTROL_SYNC_GFDT) {
9822 /* Project: All / Argument: Sync GFDT
9823 *
9824 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9825 * than '0' or 0x2520[13] must be set."
9826 *
9827 * For now, we just assert that the caller does this.
9828 */
9829 assert(non_lri_post_sync_flags != 0);
9830 }
9831
9832 if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
9833 /* Project: IVB+ / Argument: TLB inv
9834 *
9835 * "Requires stall bit ([20] of DW1) set."
9836 *
9837 * Also, from the PIPE_CONTROL instruction table:
9838 *
9839 * "Project: SKL+
9840 * Post Sync Operation or CS stall must be set to ensure a TLB
9841 * invalidation occurs. Otherwise no cycle will occur to the TLB
9842 * cache to invalidate."
9843 *
9844 * This is not a subset of the earlier rule, so there's nothing to do.
9845 */
9846 flags |= PIPE_CONTROL_CS_STALL;
9847 }
9848
9849 if (GFX_VER == 9 && devinfo->gt == 4) {
9850 /* TODO: The big Skylake GT4 post sync op workaround */
9851 }
9852
9853 /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
9854
9855 if (IS_COMPUTE_PIPELINE(batch)) {
9856 if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
9857 /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
9858 * PIPE_CONTROL, Flush Types:
9859 * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
9860 * For newer platforms this is documented in the PIPE_CONTROL
9861 * instruction page.
9862 */
9863 flags |= PIPE_CONTROL_CS_STALL;
9864 }
9865
9866 if (GFX_VER == 8 && (post_sync_flags ||
9867 (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
9868 PIPE_CONTROL_DEPTH_STALL |
9869 PIPE_CONTROL_RENDER_TARGET_FLUSH |
9870 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9871 PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
9872 /* Project: BDW / Arguments:
9873 *
9874 * - LRI Post Sync Operation [23]
9875 * - Post Sync Op [15:14]
9876 * - Notify En [8]
9877 * - Depth Stall [13]
9878 * - Render Target Cache Flush [12]
9879 * - Depth Cache Flush [0]
9880 * - DC Flush Enable [5]
9881 *
9882 * "Requires stall bit ([20] of DW) set for all GPGPU and Media
9883 * Workloads."
9884 */
9885 flags |= PIPE_CONTROL_CS_STALL;
9886
9887 /* Also, from the PIPE_CONTROL instruction table, bit 20:
9888 *
9889 * "Project: BDW
9890 * This bit must be always set when PIPE_CONTROL command is
9891 * programmed by GPGPU and MEDIA workloads, except for the cases
9892 * when only Read Only Cache Invalidation bits are set (State
9893 * Cache Invalidation Enable, Instruction cache Invalidation
9894 * Enable, Texture Cache Invalidation Enable, Constant Cache
9895 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
9896 * need not implemented when FF_DOP_CG is disable via "Fixed
9897 * Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
9898 *
9899 * It sounds like we could avoid CS stalls in some cases, but we
9900 * don't currently bother. This list isn't exactly the list above,
9901 * either...
9902 */
9903 }
9904 }
9905
9906 /* "Stall" workarounds ----------------------------------------------
9907 * These have to come after the earlier ones because we may have added
9908 * some additional CS stalls above.
9909 */
9910
9911 if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
9912 /* Project: PRE-SKL, VLV, CHV
9913 *
9914 * "[All Stepping][All SKUs]:
9915 *
9916 * One of the following must also be set:
9917 *
9918 * - Render Target Cache Flush Enable ([12] of DW1)
9919 * - Depth Cache Flush Enable ([0] of DW1)
9920 * - Stall at Pixel Scoreboard ([1] of DW1)
9921 * - Depth Stall ([13] of DW1)
9922 * - Post-Sync Operation ([13] of DW1)
9923 * - DC Flush Enable ([5] of DW1)"
9924 *
9925 * If we don't already have one of those bits set, we choose to add
9926 * "Stall at Pixel Scoreboard". Some of the other bits require a
9927 * CS stall as a workaround (see above), which would send us into
9928 * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
9929 * appears to be safe, so we choose that.
9930 */
9931 const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
9932 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9933 PIPE_CONTROL_WRITE_IMMEDIATE |
9934 PIPE_CONTROL_WRITE_DEPTH_COUNT |
9935 PIPE_CONTROL_WRITE_TIMESTAMP |
9936 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9937 PIPE_CONTROL_DEPTH_STALL |
9938 PIPE_CONTROL_DATA_CACHE_FLUSH;
9939 if (!(flags & wa_bits))
9940 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
9941 }
9942
9943 if (INTEL_NEEDS_WA_1409600907 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
9944 /* Wa_1409600907:
9945 *
9946 * "PIPE_CONTROL with Depth Stall Enable bit must be set
       * with any PIPE_CONTROL with Depth Flush Enable bit set."
9948 */
9949 flags |= PIPE_CONTROL_DEPTH_STALL;
9950 }
9951
9952 /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
9953 * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
9954 * with CS_STALL Bit set (with No POST_SYNC ENABLED)
9955 */
9956 if (intel_device_info_is_adln(devinfo) &&
9957 IS_COMPUTE_PIPELINE(batch) &&
9958 flags_to_post_sync_op(flags) != NoWrite) {
9959 iris_emit_raw_pipe_control(batch, "Wa_14014966230",
9960 PIPE_CONTROL_CS_STALL, NULL, 0, 0);
9961 }
9962
9963 batch_mark_sync_for_pipe_control(batch, flags);
9964
9965 #if INTEL_NEEDS_WA_14010840176
9966 /* "If the intention of “constant cache invalidate” is
9967 * to invalidate the L1 cache (which can cache constants), use “HDC
9968 * pipeline flush” instead of Constant Cache invalidate command."
9969 *
9970 * "If L3 invalidate is needed, the w/a should be to set state invalidate
9971 * in the pipe control command, in addition to the HDC pipeline flush."
9972 */
9973 if (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) {
9974 flags &= ~PIPE_CONTROL_CONST_CACHE_INVALIDATE;
9975 flags |= PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_STATE_CACHE_INVALIDATE;
9976 }
9977 #endif
9978
9979 /* Emit --------------------------------------------------------------- */
9980
9981 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
9982 fprintf(stderr,
9983 " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
9984 (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
9985 (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
9986 (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
9987 (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
9988 (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
9989 (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
9990 (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
9991 (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
9992 (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
9993 (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
9994 (flags & PIPE_CONTROL_CCS_CACHE_FLUSH) ? "CCS " : "",
9995 (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
9996 (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
9997 (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
9998 (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
9999 (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
10000 (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
10001 (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
10002 "SnapRes" : "",
10003 (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
10004 "ISPDis" : "",
10005 (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
10006 (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
10007 (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
10008 (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
10009 (flags & PIPE_CONTROL_PSS_STALL_SYNC) ? "PSS " : "",
10010 (flags & PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH) ? "UntypedDataPortCache " : "",
10011 imm, reason);
10012 }
10013
10014 iris_batch_sync_region_start(batch);
10015
10016 const bool trace_pc =
10017 (flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CACHE_INVALIDATE_BITS)) != 0;
10018
10019 if (trace_pc)
10020 trace_intel_begin_stall(&batch->trace);
10021
10022 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
10023 #if GFX_VERx10 >= 125
10024 pc.PSSStallSyncEnable = flags & PIPE_CONTROL_PSS_STALL_SYNC;
10025 #endif
10026 #if GFX_VER == 12
10027 pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
10028 #endif
10029 #if GFX_VER > 11
10030 pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
10031 #endif
10032 #if GFX_VERx10 >= 125
10033 pc.UntypedDataPortCacheFlushEnable =
10034 (flags & (PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
10035 PIPE_CONTROL_FLUSH_HDC |
10036 PIPE_CONTROL_DATA_CACHE_FLUSH)) &&
10037 IS_COMPUTE_PIPELINE(batch);
10038 pc.HDCPipelineFlushEnable |= pc.UntypedDataPortCacheFlushEnable;
10039 pc.CCSFlushEnable |= flags & PIPE_CONTROL_CCS_CACHE_FLUSH;
10040 #endif
10041 pc.LRIPostSyncOperation = NoLRIOperation;
10042 pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
10043 pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
10044 pc.StoreDataIndex = 0;
10045 pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
10046 #if GFX_VERx10 < 125
10047 pc.GlobalSnapshotCountReset =
10048 flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
10049 #endif
10050 pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
10051 #if GFX_VERx10 < 200
10052 pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
10053 #endif
10054 pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
10055 pc.RenderTargetCacheFlushEnable =
10056 flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
10057 pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
10058 pc.StateCacheInvalidationEnable =
10059 flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10060 #if GFX_VER >= 12
10061 pc.L3ReadOnlyCacheInvalidationEnable =
10062 flags & PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
10063 #endif
10064 pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
10065 pc.ConstantCacheInvalidationEnable =
10066 flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10067 pc.PostSyncOperation = flags_to_post_sync_op(flags);
10068 pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
10069 pc.InstructionCacheInvalidateEnable =
10070 flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
10071 pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
10072 pc.IndirectStatePointersDisable =
10073 flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
10074 pc.TextureCacheInvalidationEnable =
10075 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
10076 pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
10077 pc.ImmediateData = imm;
10078 }
10079
10080 if (trace_pc) {
10081 trace_intel_end_stall(&batch->trace, flags,
10082 iris_utrace_pipe_flush_bit_to_ds_stall_flag,
10083 reason,0,0,0);
10084 }
10085
10086 iris_batch_sync_region_end(batch);
10087 }
10088
10089 #if GFX_VER == 9
10090 /**
10091 * Preemption on Gfx9 has to be enabled or disabled in various cases.
10092 *
10093 * See these workarounds for preemption:
10094 * - WaDisableMidObjectPreemptionForGSLineStripAdj
10095 * - WaDisableMidObjectPreemptionForTrifanOrPolygon
10096 * - WaDisableMidObjectPreemptionForLineLoop
10097 * - WA#0798
10098 *
10099 * We don't put this in the vtable because it's only used on Gfx9.
10100 */
10101 void
gfx9_toggle_preemption(struct iris_context *ice,
                       struct iris_batch *batch,
                       const struct pipe_draw_info *draw)
10105 {
10106 struct iris_genx_state *genx = ice->state.genx;
10107 bool object_preemption = true;
10108
10109 /* WaDisableMidObjectPreemptionForGSLineStripAdj
10110 *
10111 * "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
10112 * and GS is enabled."
10113 */
10114 if (draw->mode == MESA_PRIM_LINE_STRIP_ADJACENCY &&
10115 ice->shaders.prog[MESA_SHADER_GEOMETRY])
10116 object_preemption = false;
10117
10118 /* WaDisableMidObjectPreemptionForTrifanOrPolygon
10119 *
10120 * "TriFan miscompare in Execlist Preemption test. Cut index that is
10121 * on a previous context. End the previous, the resume another context
10122 * with a tri-fan or polygon, and the vertex count is corrupted. If we
10123 * prempt again we will cause corruption.
10124 *
10125 * WA: Disable mid-draw preemption when draw-call has a tri-fan."
10126 */
10127 if (draw->mode == MESA_PRIM_TRIANGLE_FAN)
10128 object_preemption = false;
10129
10130 /* WaDisableMidObjectPreemptionForLineLoop
10131 *
10132 * "VF Stats Counters Missing a vertex when preemption enabled.
10133 *
10134 * WA: Disable mid-draw preemption when the draw uses a lineloop
10135 * topology."
10136 */
10137 if (draw->mode == MESA_PRIM_LINE_LOOP)
10138 object_preemption = false;
10139
10140 /* WA#0798
10141 *
10142 * "VF is corrupting GAFS data when preempted on an instance boundary
10143 * and replayed with instancing enabled.
10144 *
10145 * WA: Disable preemption when using instanceing."
10146 */
10147 if (draw->instance_count > 1)
10148 object_preemption = false;
10149
10150 if (genx->object_preemption != object_preemption) {
10151 iris_enable_obj_preemption(batch, object_preemption);
10152 genx->object_preemption = object_preemption;
10153 }
10154 }
10155 #endif
10156
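/**
 * Reset generation-specific shadow state when a batch's contents are lost,
 * so previously emitted packets (like the last index buffer) aren't assumed
 * to still be programmed.
 */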
10157 static void
iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
10159 {
10160 struct iris_genx_state *genx = ice->state.genx;
10161
10162 #if INTEL_NEEDS_WA_1808121037
10163 genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
10164 #endif
10165
10166 memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
10167 }
10168
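/**
 * Emit MI_REPORT_PERF_COUNT, asking the hardware to write a performance
 * counter report tagged with \p report_id at the given buffer offset.
 */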
10169 static void
iris_emit_mi_report_perf_count(struct iris_batch *batch,
                               struct iris_bo *bo,
                               uint32_t offset_in_bytes,
                               uint32_t report_id)
10174 {
10175 iris_batch_sync_region_start(batch);
10176 iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
10177 mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
10178 IRIS_DOMAIN_OTHER_WRITE);
10179 mi_rpc.ReportID = report_id;
10180 }
10181 iris_batch_sync_region_end(batch);
10182 }
10183
10184 /**
10185 * Update the pixel hashing modes that determine the balancing of PS threads
10186 * across subslices and slices.
10187 *
10188 * \param width Width bound of the rendering area (already scaled down if \p
10189 * scale is greater than 1).
10190 * \param height Height bound of the rendering area (already scaled down if \p
10191 * scale is greater than 1).
10192 * \param scale The number of framebuffer samples that could potentially be
10193 * affected by an individual channel of the PS thread. This is
10194 * typically one for single-sampled rendering, but for operations
10195 * like CCS resolves and fast clears a single PS invocation may
10196 * update a huge number of pixels, in which case a finer
10197 * balancing is desirable in order to maximally utilize the
10198 * bandwidth available. UINT_MAX can be used as shorthand for
10199 * "finest hashing mode available".
10200 */
10201 void
genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
                        unsigned width, unsigned height, unsigned scale)
10204 {
10205 #if GFX_VER == 9
10206 const struct intel_device_info *devinfo = batch->screen->devinfo;
10207 const unsigned slice_hashing[] = {
10208 /* Because all Gfx9 platforms with more than one slice require
10209 * three-way subslice hashing, a single "normal" 16x16 slice hashing
10210 * block is guaranteed to suffer from substantial imbalance, with one
10211 * subslice receiving twice as much work as the other two in the
10212 * slice.
10213 *
10214 * The performance impact of that would be particularly severe when
10215 * three-way hashing is also in use for slice balancing (which is the
10216 * case for all Gfx9 GT4 platforms), because one of the slices
10217 * receives one every three 16x16 blocks in either direction, which
10218 * is roughly the periodicity of the underlying subslice imbalance
10219 * pattern ("roughly" because in reality the hardware's
10220 * implementation of three-way hashing doesn't do exact modulo 3
10221 * arithmetic, which somewhat decreases the magnitude of this effect
10222 * in practice). This leads to a systematic subslice imbalance
10223 * within that slice regardless of the size of the primitive. The
10224 * 32x32 hashing mode guarantees that the subslice imbalance within a
10225 * single slice hashing block is minimal, largely eliminating this
10226 * effect.
10227 */
10228 _32x32,
10229 /* Finest slice hashing mode available. */
10230 NORMAL
10231 };
10232 const unsigned subslice_hashing[] = {
10233 /* 16x16 would provide a slight cache locality benefit especially
10234 * visible in the sampler L1 cache efficiency of low-bandwidth
10235 * non-LLC platforms, but it comes at the cost of greater subslice
10236 * imbalance for primitives of dimensions approximately intermediate
10237 * between 16x4 and 16x16.
10238 */
10239 _16x4,
10240 /* Finest subslice hashing mode available. */
10241 _8x4
10242 };
10243 /* Dimensions of the smallest hashing block of a given hashing mode. If
10244 * the rendering area is smaller than this there can't possibly be any
10245 * benefit from switching to this mode, so we optimize out the
10246 * transition.
10247 */
10248 const unsigned min_size[][2] = {
10249 { 16, 4 },
10250 { 8, 4 }
10251 };
10252 const unsigned idx = scale > 1;
10253
10254 if (width > min_size[idx][0] || height > min_size[idx][1]) {
10255 iris_emit_raw_pipe_control(batch,
10256 "workaround: CS stall before GT_MODE LRI",
10257 PIPE_CONTROL_STALL_AT_SCOREBOARD |
10258 PIPE_CONTROL_CS_STALL,
10259 NULL, 0, 0);
10260
10261 iris_emit_reg(batch, GENX(GT_MODE), reg) {
10262 reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
10263 reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
10264 reg.SubsliceHashing = subslice_hashing[idx];
10265 reg.SubsliceHashingMask = -1;
10266 };
10267
10268 ice->state.current_hash_scale = scale;
10269 }
10270 #endif
10271 }
10272
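/**
 * Gallium's set_frontend_noop hook: toggle no-op mode on the render and
 * compute batches, and mark all state dirty so everything is re-emitted
 * once real work resumes.
 */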
10273 static void
iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
10275 {
10276 struct iris_context *ice = (struct iris_context *) ctx;
10277
10278 if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
10279 ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
10280 ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
10281 }
10282
10283 if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
10284 ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
10285 ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
10286 }
10287 }
10288
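/**
 * Fill out the screen vtable with this generation's state upload and
 * emission functions.
 */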
10289 void
genX(init_screen_state)(struct iris_screen *screen)
10291 {
10292 assert(screen->devinfo->verx10 == GFX_VERx10);
10293 screen->vtbl.destroy_state = iris_destroy_state;
10294 screen->vtbl.init_render_context = iris_init_render_context;
10295 screen->vtbl.init_compute_context = iris_init_compute_context;
10296 screen->vtbl.init_copy_context = iris_init_copy_context;
10297 screen->vtbl.upload_render_state = iris_upload_render_state;
10298 screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
10299 screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
10300 screen->vtbl.update_binder_address = iris_update_binder_address;
10301 screen->vtbl.upload_compute_state = iris_upload_compute_state;
10302 screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
10303 screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
10304 screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
10305 screen->vtbl.rebind_buffer = iris_rebind_buffer;
10306 screen->vtbl.load_register_reg32 = iris_load_register_reg32;
10307 screen->vtbl.load_register_reg64 = iris_load_register_reg64;
10308 screen->vtbl.load_register_imm32 = iris_load_register_imm32;
10309 screen->vtbl.load_register_imm64 = iris_load_register_imm64;
10310 screen->vtbl.load_register_mem32 = iris_load_register_mem32;
10311 screen->vtbl.load_register_mem64 = iris_load_register_mem64;
10312 screen->vtbl.store_register_mem32 = iris_store_register_mem32;
10313 screen->vtbl.store_register_mem64 = iris_store_register_mem64;
10314 screen->vtbl.store_data_imm32 = iris_store_data_imm32;
10315 screen->vtbl.store_data_imm64 = iris_store_data_imm64;
10316 screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
10317 screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
10318 screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
10319 screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
10320 screen->vtbl.populate_vs_key = iris_populate_vs_key;
10321 screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
10322 screen->vtbl.populate_tes_key = iris_populate_tes_key;
10323 screen->vtbl.populate_gs_key = iris_populate_gs_key;
10324 screen->vtbl.populate_fs_key = iris_populate_fs_key;
10325 screen->vtbl.populate_cs_key = iris_populate_cs_key;
10326 screen->vtbl.lost_genx_state = iris_lost_genx_state;
10327 screen->vtbl.disable_rhwo_optimization = iris_disable_rhwo_optimization;
10328 }
10329
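/**
 * Install the Gallium state hooks and set up initial context state:
 * mark everything dirty, set default sample mask and viewport counts,
 * upload a null surface for unbound textures, and default all scissors
 * to empty rectangles.
 */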
10330 void
genX(init_state)(struct iris_context *ice)
10332 {
10333 struct pipe_context *ctx = &ice->ctx;
10334 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
10335
10336 ctx->create_blend_state = iris_create_blend_state;
10337 ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
10338 ctx->create_rasterizer_state = iris_create_rasterizer_state;
10339 ctx->create_sampler_state = iris_create_sampler_state;
10340 ctx->create_sampler_view = iris_create_sampler_view;
10341 ctx->create_surface = iris_create_surface;
10342 ctx->create_vertex_elements_state = iris_create_vertex_elements;
10343 ctx->bind_blend_state = iris_bind_blend_state;
10344 ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
10345 ctx->bind_sampler_states = iris_bind_sampler_states;
10346 ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
10347 ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
10348 ctx->delete_blend_state = iris_delete_state;
10349 ctx->delete_depth_stencil_alpha_state = iris_delete_state;
10350 ctx->delete_rasterizer_state = iris_delete_state;
10351 ctx->delete_sampler_state = iris_delete_state;
10352 ctx->delete_vertex_elements_state = iris_delete_state;
10353 ctx->set_blend_color = iris_set_blend_color;
10354 ctx->set_clip_state = iris_set_clip_state;
10355 ctx->set_constant_buffer = iris_set_constant_buffer;
10356 ctx->set_shader_buffers = iris_set_shader_buffers;
10357 ctx->set_shader_images = iris_set_shader_images;
10358 ctx->set_sampler_views = iris_set_sampler_views;
10359 ctx->set_compute_resources = iris_set_compute_resources;
10360 ctx->set_global_binding = iris_set_global_binding;
10361 ctx->set_tess_state = iris_set_tess_state;
10362 ctx->set_patch_vertices = iris_set_patch_vertices;
10363 ctx->set_framebuffer_state = iris_set_framebuffer_state;
10364 ctx->set_polygon_stipple = iris_set_polygon_stipple;
10365 ctx->set_sample_mask = iris_set_sample_mask;
10366 ctx->set_scissor_states = iris_set_scissor_states;
10367 ctx->set_stencil_ref = iris_set_stencil_ref;
10368 ctx->set_vertex_buffers = iris_set_vertex_buffers;
10369 ctx->set_viewport_states = iris_set_viewport_states;
10370 ctx->sampler_view_destroy = iris_sampler_view_destroy;
10371 ctx->surface_destroy = iris_surface_destroy;
10372 ctx->draw_vbo = iris_draw_vbo;
10373 ctx->launch_grid = iris_launch_grid;
10374 ctx->create_stream_output_target = iris_create_stream_output_target;
10375 ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
10376 ctx->set_stream_output_targets = iris_set_stream_output_targets;
10377 ctx->set_frontend_noop = iris_set_frontend_noop;
10378
10379 ice->state.dirty = ~0ull;
10380 ice->state.stage_dirty = ~0ull;
10381
10382 ice->state.statistics_counters_enabled = true;
10383
10384 ice->state.sample_mask = 0xffff;
10385 ice->state.num_viewports = 1;
10386 ice->state.prim_mode = MESA_PRIM_COUNT;
10387 ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
10388 ice->draw.derived_params.drawid = -1;
10389
10390 #if GFX_VERx10 >= 120
10391 ice->state.genx->object_preemption = true;
10392 #endif
10393
10394 /* Make a 1x1x1 null surface for unbound textures */
10395 void *null_surf_map =
10396 upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
10397 4 * GENX(RENDER_SURFACE_STATE_length), 64);
10398 isl_null_fill_state(&screen->isl_dev, null_surf_map,
10399 .size = isl_extent3d(1, 1, 1));
10400 ice->state.unbound_tex.offset +=
10401 iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));
10402
10403 /* Default all scissor rectangles to be empty regions. */
10404 for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
10405 ice->state.scissors[i] = (struct pipe_scissor_state) {
10406 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
10407 };
10408 }
10409 }
10410